first commit

This commit is contained in:
yumoqing 2025-07-16 14:46:24 +08:00
commit 3f6944adb0
93 changed files with 7700 additions and 0 deletions

README.md (Normal file, +90 lines)

@ -0,0 +1,90 @@
# pcapi
Compute center API server
## Installation notes
* Run `sh install.sh` first to install or upgrade the third-party libraries
* Pull the sqlor, ahserver, apppublic, rbac and appbase libraries and install them in that order
* Installing directly with `pip install -r requirement.txt` is not recommended; it tends to cause minor problems
## Security guarantees
* The https protocol is used
* User credentials are transmitted via BasicAuth
* The client IP is checked against the allowed IP set
* Requests that fail any of the checks above receive a 401 response
## Server-side code
### Example feature
#### Step 1: wrap a K8S function
```
async def create_namespaced_job_v1(namespace, jobdesc):
    batch_v1 = client.BatchV1Api()
    f = awaitify(batch_v1.create_namespaced_job)
    return await f(namespace=namespace, body=jobdesc)
```
The code above wraps the following k8s call:
```
batch_v1 = client.BatchV1Api()
batch_v1.create_namespaced_job(namespace=..., body=...)
```
#### Step 2: make the new function available in dspy scripts
Put the following code at the end of the init_func() function:
```
g.create_namespaced_job_v1 = create_namespaced_job_v1
```
### Adding new functionality
When new functionality is needed, follow the steps above.
### Data parameterization function
paramify(data, ns):
data is a Python string, dict, or list whose contents may reference variables written as "${name}$"; this function replaces those references with the corresponding values from the ns dict.
This function is available in dspy scripts.
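A minimal sketch of the substitution (illustrative values, not from the project):
```
data = {"user": "${key}$", "pwd": "${secretkey}$"}
ns = {"key": "abc", "secretkey": "xyz"}
result = paramify(data, ns)
# result == {"user": "abc", "pwd": "xyz"}
```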
### Backend directory convention
* All backend programs live under wwwroot
* wwwroot contains a single api directory
* Inside api, create one v{version} directory per API version; the version number is an integer starting at "1"
* Each API has its own directory, named after the API, inside the version directory
* The API code lives in the index.dspy file inside that API directory (see the layout sketch below)
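For example, a hypothetical API named myapi, published as version 1, would be laid out like this:
```
wwwroot/
  api/
    v1/
      myapi/
        index.dspy
```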
### Example interface script
```
info(f'test .....{params_kw=}')
data = {
    "k":"${key}$",
    "s":"${secretkey}$"
}
ns = paramify(data, params_kw)
return ns
```
## Client-side examples
### curl version
```
curl --basic --user kyycloud:Kyy@123456 https://pcapi.opencomputing.cn
```
### dspy version
```
# The client supplies the username and password; basic_auth_headers builds
# the HTTP headers needed for authentication, and they are passed to the
# request through the headers argument.
headers = basic_auth_headers('kyycloud','Kyy@123456')
hc = HttpClient()
resp = await hc.request('https://pcapi.opencomputing.cn',
    method='GET',
    params={
        "a":"1",
        "key":"87y32iuhi453u8y56",
        "secretkey":"qqqqqcccccceeeee"
    },
    headers=headers)
info(f'{resp=}')
return resp
```
# bricks

app/__init__.py (Normal file, +0 lines)


File diff suppressed because one or more lines are too long


@ -0,0 +1,530 @@
import yaml
from kubernetes import client, config
from kubernetes.client.exceptions import ApiException
from appPublic.log import debug
import time
import re
import json
import ast
def format_source_labels(source_selflabel, type=None):
    """
    Format source labels; multiple key-value pairs are supported.
    :param source_selflabel: source label string, e.g.
        pod style: "key1:value1,key2:value2"
        node style: "key3=value3,key4=value4"
    :param type: label type ("pod" or "node")
    :return: formatted label dict {key: value}
    """
if not source_selflabel or len(source_selflabel.strip()) == 0:
return {}
label_dict = {}
if type == "pod":
        # Pod labels use a colon separator; multiple pairs are comma-separated
for pair in source_selflabel.strip().split(","):
if ":" in pair:
key, value = pair.strip().split(":", 1)
label_dict[key.strip()] = value.strip()
elif type == "node":
        # Node labels use an equals-sign separator; multiple pairs are comma-separated
for pair in source_selflabel.strip().split(","):
if "=" in pair:
key, value = pair.strip().split("=", 1)
label_dict[key.strip()] = value.strip()
else:
return {}
return label_dict
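# Usage sketch (illustrative values, not taken from the project):
# format_source_labels("key1:value1,key2:value2", type="pod")
#   -> {"key1": "value1", "key2": "value2"}
# format_source_labels("key3=value3,key4=value4", type="node")
#   -> {"key3": "value3", "key4": "value4"}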
def format_runtime(seconds):
if seconds < 60:
return f"{int(seconds)}s"
elif seconds < 3600:
minutes = int(seconds // 60)
return f"{minutes}m"
elif seconds < 86400:
hours = int(seconds // 3600)
return f"{hours}h"
else:
days = int(seconds // 86400)
return f"{days}d"
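# Usage sketch (illustrative values): format_runtime(45) -> "45s",
# format_runtime(3700) -> "1h", format_runtime(200000) -> "2d"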
def extract_model_labels(hardware_list):
    """
    Extract the model labels from a hardware list and return them as a list.
    :param hardware_list: hardware list whose entries yield labels such as
        kyy-gpu-model=RTX5090-32G,kyy-cpu-model=INTEL(R) XEON(R) PLATINUM 8582C
    """
labels = []
for item in hardware_list:
if item["type"] in {"cpu", "gpu"}:
labels.append(f"kyy-{item['type']}-model={item['model'].replace(' ','-').replace('(','-').replace(')','-').replace('kyy-','')}")
return labels
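# Usage sketch (entry shape as documented above, illustrative values):
# extract_model_labels([{"type": "gpu", "model": "RTX5090-32G", "amount": 1}])
#   -> ["kyy-gpu-model=RTX5090-32G"]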
def determine_accommodat(kubeconfig,get_resources):
    """
    Decide whether the requested product resources can be accommodated by the current cluster.
    Disk is excluded, because disk is usually provided through PV/PVC and is not part of the node resource statistics.
    :param kubeconfig: kubeconfig configuration
    :param get_resources: product resource dict, e.g.
{
"5436-f-gdsb--ewrewrerrtwt": [
{
"type": "cpu",
"model": "INTEL(R) XEON(R) PLATINUM 8582C",
"amount": 0
},
{
"type": "memory",
"model": "Samsung DDR4 DIMMs",
"amount": 0
},
{
"type": "disk",
"model": "DATA",
"amount": 0
},
{
"type": "gpu",
"model": "RTX5090-32G",
"amount": 0
}
],
"6787jhgvgjhv32412343142jvgj": [
{
"type": "cpu",
"model": "INTEL(R) XEON(R) PLATINUM 8582C",
"amount": 4
},
{
"type": "memory",
"model": "Samsung DDR4 DIMMs",
"amount": 100
},
{
"type": "disk",
"model": "DATA",
"amount": 512
},
{
"type": "gpu",
"model": "RTX5090-32G",
"amount": 2
}
],
}
    :param kyylabels: custom node labels, e.g. "key1:value1,key2:value2"
    :return: list of the product IDs that cannot be accommodated by any node in the cluster
    """
init_ids = []
try:
all_quota = get_node_info(kubeconfig).get('rows', [])
if not all_quota:
debug("determine_accommodat: 没有获取到节点信息")
return init_ids
products = {}
if isinstance(get_resources, str):
debug(f"---get_resources格式:{type(get_resources)}")
products = json.loads(get_resources)
# debug(f"1---products格式:{type(products)}")
if isinstance(products, str):
products = eval(products)
debug(f"2---products格式:{type(products)}")
all_quota = [x for x in all_quota if x['node_status'] != '未就绪' and x['node_role'] != 'master']
debug(f"\n 接收请求资源={products},\n 现有资源:{all_quota}")
# 预处理节点数据,转换为数值类型
processed_nodes = []
for node in all_quota:
# 跳过不可用节点和控制节点
if node['node_status'] != '已就绪' or node['node_role'] == 'master':
#debug(f"跳过未就绪节点/控制节点:{node['node_internalip']} {node['node_status']} {node['node_name']}")
continue
            # Extract the available CPU: strip the "核" (core) suffix and convert to float
            cpu_str = node['available_cpu'].replace('核', '')
available_cpu = float(cpu_str)
# 提取可用内存处理Gi单位
mem_str = node['available_memory']
if mem_str.endswith('Gi'):
available_memory = float(mem_str.replace('Gi', ''))
else:
# 假设其他单位为Mi并转换为Gi
available_memory = float(mem_str.replace('Mi', '')) / 1024
available_gpu = node['available_gpu']
processed_nodes.append({
'node_name': node['node_name'],
'node_labels': node['node_labels'], # 节点自定义标签
'cpu': available_cpu,
'memory': available_memory,
'gpu': available_gpu
})
# 找出无法部署的产品ID
init_ids = []
for product_id, resources in products.items():
# 提取产品资源需求
product_cpu = next((r['amount'] for r in resources if r['type'] == 'cpu'), 0)
product_memory = next((r['amount'] for r in resources if r['type'] == 'memory'), 0)
product_gpu = next((r['amount'] for r in resources if r['type'] == 'gpu'), 0)
# 管理员视角创建Pod的时候CPU的请求单位可能是毫核(m)也可能是Gi如果是m则转成核
if "m" in str(product_cpu):
product_cpu = float(product_cpu.replace("m", "")) / 1000.0
# 管理员视角创建Pod的时候内存的请求单位可能带了单位Gi如果是1G则转成1.0
if "Gi" in str(product_memory):
product_memory = float(product_memory.replace("Gi", ""))
elif "Mi" in str(product_memory):
product_memory = float(product_memory.replace("Mi", "")) / 1024.0
# 管理员视角创建Pod的时候磁盘的请求单位可能是带了单位Gi如果是1G则转成1.0
# 这里磁盘不在节点资源统计中,所以不处理
# if "Gi" in str(product_disk):
# product_disk = float(product_disk.replace("Gi", ""))
# elif "Mi" in str(product_disk):
# product_disk = float(product_disk.replace("Mi", "")) / 1024
# 检查是否存在任何节点可以满足该产品需求(这里规定,不能完全占满,只能略小于,毕竟节点上可能还有其他服务会动态占用资源)
can_deploy = False
for node in processed_nodes:
#此处转换标签并给出判断该节点此标签产品是否可部署
kyy_labels = extract_model_labels(resources)
if kyy_labels:
# 检查节点标签是否包含产品所需的标签
if not all(label in node['node_labels'] for label in kyy_labels):
debug(f"节点 {node['node_name']} 不满足产品 {product_id} 的标签要求: {kyy_labels}")
continue
debug(f'✅ 请求标签在其中节点选择器标签范围内,可部署: {kyy_labels}')
debug(f"核心参数判断:{product_cpu=} {node['cpu']=} # {float(product_memory)=} {node['memory']=} # {float(product_gpu)=} {node['gpu']=}")
if (product_cpu < node['cpu'] and float(product_memory) < node['memory'] and float(product_gpu) <= node['gpu']):
can_deploy = True
break
if not can_deploy:
init_ids.append(product_id)
debug(f"无法在集群任何节点上部署的产品ID: {init_ids}")
return init_ids
    except:
        import traceback
        debug(f"determine_accommodat failed: {traceback.format_exc()}")
        raise Exception(f"determine_accommodat exception: {traceback.format_exc()}")
def get_pod_info(kubeconfig):
try:
# config.load_kube_config()
kubeconfig = yaml.safe_load(kubeconfig)
config.load_kube_config_from_dict(kubeconfig)
v1 = client.CoreV1Api()
api_client = client.ApiClient()
namespaces = v1.list_namespace(timeout_seconds=1).items
non_system_namespaces = [ns.metadata.name for ns in namespaces if
not ns.metadata.name.startswith(('kube-', 'default', 'local', 'ingress-'))]
rows = []
for namespace in non_system_namespaces:
pods = v1.list_namespaced_pod(namespace).items
pod_metrics_path = f"/apis/metrics.k8s.io/v1beta1/namespaces/{namespace}/pods"
pod_metrics_response = api_client.call_api(
pod_metrics_path, 'GET', auth_settings=['BearerToken'], response_type='object')[0]
pod_metrics = {pod['metadata']['name']: pod.get("containers",[{}])[0].get('usage', {})
for pod in pod_metrics_response.get('items', [])}
# debug(f"### pods={pods}")
for pod in pods:
pod_name = pod.metadata.name
                # Total number of containers in the pod
                total_containers = len(pod.spec.containers)
                # Number of containers reporting ready
                ready_count = 0
                if pod.status.container_statuses:
                    ready_count = sum(1 for status in pod.status.container_statuses if status.ready)
                # Readiness ratio and overall readiness status
                ready_ratio = ready_count / total_containers if total_containers > 0 else 0
                ready_status = "已就绪" if ready_ratio >= 1 else "未就绪"
                # An earlier set-based formulation of this check was dropped because it misbehaves in edge cases
readiness_conditions = [{"type": cond.type, "status": cond.status}
for cond in pod.status.conditions if cond.type == "Ready"]
phase = pod.status.phase
restart_count = sum(cs.restart_count for cs in pod.status.container_statuses) if pod.status.container_statuses else 0
running_time = time.time() - pod.metadata.creation_timestamp.timestamp()
pod_age = format_runtime(running_time)
pod_ip = pod.status.pod_ip if pod.status.pod_ip else "Unknown"
node_name = pod.spec.node_name if pod.spec.node_name else "Pod未被调度到节点"
nominated_node = pod.status.nominated_node_name if pod.status.nominated_node_name else ""
if phase == "Pending":
pod_ip = "Pending状态,未分配 IP"
node_name = "Pending状态,未分配节点"
nominated_node = "Pending状态,未分配节点"
# 提取容器的资源限制limits
cpu_limit = "未设置"
memory_limit = "未设置"
gpu_limit = "未设置"
if pod.spec.containers:
container = pod.spec.containers[0] # 假设只取第一个容器
if container.resources and container.resources.limits:
limits = container.resources.limits
cpu_limit = limits.get("cpu", "未设置") # 假设 CPU 限制以核为单位
# 处理特殊情况,如果 CPU 限制以毫核(m)为单位,转换为核
# debug(f'cpu_limit==={cpu_limit}')
if isinstance(cpu_limit, str) and cpu_limit.endswith("m"):
debug(f'无法识别的cpu_limit格式:{cpu_limit} 转换为 {float((int(cpu_limit.replace("m", "")) / 1000))}')
cpu_limit = f'{float((int(cpu_limit.replace("m", "")) / 1000))}'
memory_limit = limits.get("memory", "未设置")
gpu_limit = limits.get("nvidia.com/gpu", "未设置") # 只支持 NVIDIA GPU
# 获取 metrics 数据(已有逻辑不变)
cpu_usage = pod_metrics.get(pod_name, {}).get('cpu', 'undefined')
if cpu_usage and isinstance(cpu_usage, str):
cpu_usage = int(cpu_usage.replace("n", "")) if cpu_usage.endswith("n") else 0
cpu_usage = f'{(cpu_usage / 1000000 / 1000):.3f}'
memory_usage = pod_metrics.get(pod_name, {}).get('memory', 'undefined')
if memory_usage and isinstance(memory_usage, str):
memory_usage = int(memory_usage.replace("Ki", "")) if memory_usage.endswith("Ki") else 0
memory_usage = f"{(memory_usage / 1024 / 1024):.3f}Gi"
if phase in ["Pending", "Succeeded", "Failed"]:
cpu_usage = "Pod未运行,无资源使用数据"
memory_usage = "Pod未运行,无资源使用数据"
# 新增 GPU 使用情况字段(暂时用占位符)
gpu_usage = "0%" # 如果你有 DCGM / Prometheus 可替换为实际值
pod_info = {
"pod_namespace": namespace,
"pod_name": pod_name,
"pod_ready": ready_status,
"pod_running": phase,
"pod_restart": str(restart_count),
"pod_age": pod_age,
"pod_ip": pod_ip,
"pod_node": node_name,
"pod_nominated_node": nominated_node,
"pod_cpurate": cpu_usage,
"pod_memrate": memory_usage,
# 新增字段
"pod_gpu": gpu_limit,
"pod_cpu_limit": cpu_limit + "" if cpu_limit != "未设置" else "未设置",
"pod_memory_limit": memory_limit,
"pod_gpu_limit": gpu_limit,
}
rows.append(pod_info)
result = {
"total": len(rows),
"rows": rows
}
return result
    except Exception as e:
        import traceback
        debug(f"Failed to get pod information: {traceback.format_exc()}")
        raise Exception(traceback.format_exc())
def get_node_info(kubeconfig):
# 加载配置
try:
kubeconfig = yaml.safe_load(kubeconfig)
config.load_kube_config_from_dict(kubeconfig)
v1 = client.CoreV1Api()
api_client = client.ApiClient()
# 获取节点指标和 Pod 列表
node_metrics_path = "/apis/metrics.k8s.io/v1beta1/nodes"
node_metrics_response = api_client.call_api(
node_metrics_path, 'GET', auth_settings=['BearerToken'], response_type='object')[0]
node_metrics = {node['metadata']['name']: node.get('usage', {})
for node in node_metrics_response.get('items', [])}
# 获取所有 Pod 及其资源请求
pods = v1.list_pod_for_all_namespaces(timeout_seconds=1).items
node_pod_resources = {} # 存储每个节点上 Pod 的资源请求
for pod in pods:
if pod.spec.node_name and pod.status.phase in ["Running", "Pending"]:
node_name = pod.spec.node_name
if node_name not in node_pod_resources:
node_pod_resources[node_name] = {
"cpu": 0,
"memory": 0,
"gpu": 0
}
# 累加容器请求的资源
for container in pod.spec.containers:
if container.resources and container.resources.requests:
# CPU (转换为 millicores)
cpu_request = container.resources.requests.get("cpu", "0m")
cpu_millis = int(float(cpu_request.rstrip("m"))) if "m" in cpu_request else int(float(cpu_request) * 1000)
node_pod_resources[node_name]["cpu"] += cpu_millis
# Memory (转换为 bytes)
memory_request = container.resources.requests.get("memory", "0")
memory_bytes = int(float(memory_request.rstrip("KiMiGi")))
if "Ki" in memory_request:
memory_bytes *= 1024
elif "Mi" in memory_request:
memory_bytes *= 1024 * 1024
elif "Gi" in memory_request:
memory_bytes *= 1024 * 1024 * 1024
node_pod_resources[node_name]["memory"] += memory_bytes
# GPU
gpu_request = container.resources.requests.get("nvidia.com/gpu", "0")
node_pod_resources[node_name]["gpu"] += int(gpu_request)
# 获取节点列表并计算资源使用情况
nodes = v1.list_node().items
rows = []
for node in nodes:
node_name = node.metadata.name
internal_ip = next((address.address for address in node.status.addresses
if address.type == "InternalIP"), "未分配")
external_ip = next((address.address for address in node.status.addresses
if address.type == "ExternalIP"), "未分配")
status = node.status.conditions[-1].status if node.status.conditions else "Unknown"
status = "已就绪" if status == "True" else "未就绪"
# 节点角色
roles = []
role_labels = [
"node-role.kubernetes.io/control-plane",
"node-role.kubernetes.io/master",
"node-role.kubernetes.io/worker"
]
for label in role_labels:
if label in node.metadata.labels:
roles.append(label.split("/")[-1])
roles_str = "master" if roles else "worker"
# 节点运行时间
running_time = time.time() - node.metadata.creation_timestamp.timestamp()
node_age = format_runtime(running_time)
# 节点信息
k8s_version = node.status.node_info.kubelet_version
os_image = node.status.node_info.os_image
kernel_version = node.status.node_info.kernel_version
container_runtime = node.status.node_info.container_runtime_version
# 自定义标签
labels = node.metadata.labels
kyy_labels = [f"{k}={v}" for k, v in labels.items() if k.startswith('kyy-')]
# 实时资源使用情况
cpu_usage = node_metrics.get(node_name, {}).get('cpu', 'undefined')
if cpu_usage and isinstance(cpu_usage, str):
cpu_usage = int(cpu_usage.replace("n", ""))
cpu_usage = f'{(cpu_usage / 1000000 / 1000):.3f}'
memory_usage = node_metrics.get(node_name, {}).get('memory', 'undefined')
if memory_usage and isinstance(memory_usage, str):
memory_usage = int(memory_usage.replace("Ki", ""))
memory_usage = f"{(memory_usage / 1024 / 1024):.3f}Gi"
# 节点总资源
total_cpu = float(node.status.allocatable.get("cpu", "0"))
total_memory = parse_resource_value(node.status.allocatable.get("memory", "0")) / (1024 ** 1) #内存默认Mi转成Gi
total_gpu = int(node.status.allocatable.get("nvidia.com/gpu", "0"))
# 已分配资源
allocated_cpu = node_pod_resources.get(node_name, {}).get("cpu", 0) / 1000.0 # 转换为 cores
allocated_memory = node_pod_resources.get(node_name, {}).get("memory", 0) / (1024 ** 3) # 转换为 Gi
allocated_gpu = node_pod_resources.get(node_name, {}).get("gpu", 0)
# 可用资源
available_cpu = total_cpu - allocated_cpu
available_memory = total_memory - allocated_memory
available_gpu = total_gpu - allocated_gpu
node_info = {
"node_name": node_name,
"node_status": status,
"node_role": roles_str,
"node_age": node_age,
"node_version": k8s_version,
"node_internalip": internal_ip,
"node_externalip": external_ip,
"node_osversion": os_image,
"node_kernelversion": kernel_version,
"node_containeruntime": container_runtime,
"node_labels": kyy_labels,
"node_cpurate": f"{(allocated_cpu / total_cpu * 100):.1f}%" if total_cpu > 0 else "0%",#cpu_usage,
"node_memrate": f"{(allocated_memory / total_memory * 100):.1f}%" if total_memory > 0 else "0%",#memory_usage,
"node_gpu":f"{(allocated_gpu / total_gpu * 100):.1f}%" if total_gpu > 0 else "0%",
# 新增资源信息
# "node_total_cpu": f"{total_cpu:.2f}核",
# "allocated_cpu": f"{allocated_cpu:.2f}核",
"available_cpu": f"{available_cpu:.2f}",
# "cpu_rate": f"{(allocated_cpu / total_cpu * 100):.1f}%" if total_cpu > 0 else "0%",
# "node_total_memory": f"{total_memory:.2f}Gi",
# "allocated_memory": f"{allocated_memory:.2f}Gi",
"available_memory": f"{available_memory:.2f}Gi",
# "memory_rate": f"{(allocated_memory / total_memory * 100):.1f}%" if total_memory > 0 else "0%",
# "node_total_gpu": total_gpu,
# "allocated_gpu": allocated_gpu,
"available_gpu": available_gpu,
# "gpu_rate": f"{(allocated_gpu / total_gpu * 100):.1f}%" if total_gpu > 0 else "0%"
}
rows.append(node_info)
result = {
"total": len(rows),
"rows": rows
}
debug(f"=== node_info={result}")
return result
    except:
        import traceback
        e = traceback.format_exc()
        debug(f"Failed to get node information: {e}")
        raise Exception(e)
# Helper: parse a Kubernetes resource quantity string
def parse_resource_value(value: str) -> float:
    """Parse a Kubernetes resource quantity (e.g. "1.5", "500m", "2Gi") into a float; see the callers for how the units are interpreted"""
if not value:
return 0.0
# 处理 CPU (cores 或 millicores)
if value.endswith('m'):
return float(value[:-1]) / 1000.0 # 转换为 cores
elif re.match(r'^\d+(\.\d+)?$', value):
return float(value) # 已经是 cores
# 处理内存 (Ki, Mi, Gi, Ti)
elif value.endswith('Ki'):
return float(value[:-2]) / (1024 ** 1) # 转换为 Gi
elif value.endswith('Mi'):
return float(value[:-2]) / (1024 ** 2)
elif value.endswith('Gi'):
return float(value[:-2])
elif value.endswith('Ti'):
return float(value[:-2]) * 1024
    return float(value)  # fall back to returning the raw numeric value
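# Usage sketch (illustrative values): parse_resource_value("500m") -> 0.5 (cores),
# parse_resource_value("2") -> 2.0; memory suffixes are scaled by the 1024 factors
# above, e.g. parse_resource_value("4096Ki") -> 4.0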

File diff suppressed because one or more lines are too long


@ -0,0 +1,573 @@
import json
import yaml
import os
import hashlib
import sqlite3
from pathlib import Path
from datetime import datetime
from os.path import expanduser
from kubernetes import client, config
from kubernetes.client import ApiException
from . import k8s_utils_linuxos_ubuntu, k8s_utils_relationaldb_mysql, parse_k8s_params
from . import ssh_utils,k8s_utils_public
from appPublic.log import debug
import traceback
def delete_cluster_node(params):
    """
    Delete a cluster node.
    --namespace / -n: specifies the namespace; nodes are cluster-level resources and do not belong to a namespace, so this flag is normally not used when deleting a node
    --force: when a node is unreachable or not responding, it can be deleted forcibly:
        kubectl delete node <node-name> --force
    --grace-period: grace period, in seconds, before the node is forcibly terminated; the default is 30 and 0 means delete immediately, usually combined with --force:
        kubectl delete node <node-name> --force --grace-period=0
    Before deleting a node, mark it unschedulable (cordon) and safely migrate its Pods to other nodes (drain).
    Mark the node unschedulable so that no new Pods are scheduled onto it:
        kubectl cordon <node-name>
    Drain the Pods on the node and migrate them to other nodes:
        kubectl drain <node-name> --ignore-daemonsets --delete-emptydir-data --ignore-not-found
    --ignore-daemonsets: ignore Pods created by DaemonSets; a DaemonSet keeps one replica on every node, so these Pods do not need to be migrated
    --delete-emptydir-data: delete the data in EmptyDir volumes on the node; EmptyDir volumes are temporary storage, so their data is lost when the node is removed
    --ignore-not-found: if the specified node does not exist, ignore the error instead of exiting with a failure
    """
return "delete_cluster_node ok"
def node_state_switch(params):
    """
    Resume a node:
    kubectl uncordon marks the node as schedulable again, so the scheduler will consider it for new Pods:
        kubectl uncordon worker-node-1
    Pause a node:
    kubectl cordon marks the node as unschedulable, so the scheduler will not place new Pods on it:
        kubectl cordon worker-node-1
    Optionally, drain the Pods on the node:
        kubectl drain <node-name> --ignore-daemonsets --delete-emptydir-data
    """
return "node_state_switch ok"
def yaml_apply_delete(params):
    """
    1. Cascade-initialize resource instances from the parameters passed in by cpcc;
    2. Cascade-update resource instances from the parameters passed in by cpcc;
    3. Cascade-delete resource instances from the parameters passed in by cpcc.
    """
    # To better support multiple resource instance types (Linux OS, relational / non-relational databases, etc.),
    # each instance type has its own code path, which keeps maintenance simpler.
instance_type = params.get("instance_type")
if instance_type == "RelationalDB":
k8s_utils_relationaldb_mysql.handle_k8s_operations(params)
# if instance_type == "RelationalDB_PostgreSQL":
# k8s_utils_relationaldb_mysql.handle_k8s_operations(params)
elif instance_type == "LinuxOS":
k8s_utils_linuxos_ubuntu.handle_k8s_operations(params)
def node_label_opt(params):
    """
    To set the label app on node worker-node-1, use:
        kubectl label nodes worker-node-1 app=app   (the equals sign between key and value sets the label)
    To remove the label app from node worker-node-1, use:
        kubectl label nodes worker-node-1 app-      (the trailing hyphen after the key removes the label)
    After a label is set or removed, the scheduler takes it into account when placing Pods, and it can be used to select specific nodes.
    Setting or removing a label does not affect Pods already running on the node; they keep running.
    """
host = params.get("host")
port = int(params.get("port"))
username = params.get("user")
password = params.get("password")
worker_node = params.get("worker_node")
label = params.get("label")
opt = params.get("opt")
if opt == "label":
get_cluster_node_cmd = [f"kubectl label nodes {worker_node} {label} --overwrite"]
debug(f'绑定标签命令: {get_cluster_node_cmd}')
if username != "root":
results = ssh_utils.ssh_execute_command_noroot(host, port, username, password,
get_cluster_node_cmd, sudo_timeout=10) # 设置标签可能需要一些时间
else:
results = ssh_utils.ssh_execute_command(host, port, username, password, get_cluster_node_cmd)
overwrite_info = results[0][0].strip()
        if "not labeled" in overwrite_info:
            raise Exception(f"{worker_node} failed to apply label {label}; check the cluster node state or whether the label is already applied")
        else:
            return f"{worker_node} applied label {label} successfully"
elif opt == "unlabel":
get_cluster_node_cmd = [f"kubectl label nodes %s %s-" % (worker_node,label.split('=')[0])]
debug(f'解绑标签命令: {get_cluster_node_cmd}')
if username != "root":
results = ssh_utils.ssh_execute_command_noroot(host, port, username, password,
get_cluster_node_cmd, sudo_timeout=10) # 取消标签可能需要一些时间
else:
results = ssh_utils.ssh_execute_command(host, port, username, password, get_cluster_node_cmd)
# debug(f'解绑标签结果: {results}')
overwrite_info = results[0][0].strip()
        if "unlabeled" in overwrite_info or overwrite_info == "":
            return f"{worker_node} removed label {label} successfully"
        else:
            raise Exception(f"{worker_node} failed to remove label {label}; check the cluster node state or whether the label was applied at all")
def unset_node_label(params):
    """
    To remove the label app from node worker-node-1, use:
        kubectl label nodes worker-node-1 app-   (the trailing hyphen after the key removes the label)
    After the label is removed, the node no longer carries it and the scheduler no longer considers it when placing Pods.
    Removing a label does not affect Pods already running on the node; they keep running.
    """
host = params.get("host")
port = int(params.get("port"))
username = params.get("user")
password = params.get("password")
worker_node = params.get("worker_node")
label = params.get("label")
def get_cluster_nodes_by_server(params):
host = params.get("host")
port = int(params.get("port"))
username = params.get("user")
password = params.get("password")
get_cluster_node_cmd = ["kubectl get nodes -o wide --show-labels"]
if username != "root":
results = ssh_utils.ssh_execute_command_noroot(host, port, username, password,
get_cluster_node_cmd, sudo_timeout=10)
else:
results = ssh_utils.ssh_execute_command(host, port, username, password, get_cluster_node_cmd)
parse_k8s_nodes_result = results[0][0].strip()
parse_k8s_nodes_result = parse_k8s_params.parse_k8s_nodes(parse_k8s_nodes_result)
# debug(f'集群 {host=} 所有节点信息如下{results=} => 转换后:\n{parse_k8s_nodes_result=}')
return parse_k8s_nodes_result
def get_cluster_pods_by_kubeconfig(params):
    """
    Using the kubeconfig supplied by the caller,
    return detailed information about all resource instances (Pods) in the cluster.
    """
kubeconfig = params.get("kubeconfig")
return k8s_utils_public.get_pod_info(kubeconfig)
def determine_accommodat_by_kubeconfig(params):
    """
    Using the kubeconfig supplied by the caller,
    determine which component combinations can be accommodated by the cluster.
    Returns a list of product IDs (see determine_accommodat).
    """
# debug(f'=====determine_accommodat_by_kubeconfig params: {params}')
kubeconfig = params.get("kubeconfig")
resources = params.get("resources", {})
# debug(f'=====kubeconfig: {kubeconfig}, resources: {resources}')
return k8s_utils_public.determine_accommodat(kubeconfig, resources)
def get_cluster_nodes_by_kubeconfig(params):
    """
    Using the kubeconfig supplied by the caller,
    return detailed information about all nodes in the cluster.
    """
kubeconfig = params.get("kubeconfig")
return k8s_utils_public.get_node_info(kubeconfig)
def get_cluster_pods_by_server(params):
host = params.get("host")
port = int(params.get("port"))
username = params.get("user")
password = params.get("password")
# get_cluster_node_cmd = ["kubectl get pods --all-namespaces -o wide"]
get_cluster_pod_cmd = ["kubectl get pods --all-namespaces -o wide | grep -Ev 'kube-flannel|kube-system'"]
if username != "root":
results = ssh_utils.ssh_execute_command_noroot(host, port, username, password,
get_cluster_pod_cmd, sudo_timeout=10)
else:
results = ssh_utils.ssh_execute_command(host, port, username, password, get_cluster_pod_cmd)
parse_k8s_pods_result = results[0][0].strip()
parse_k8s_pods_result = parse_k8s_params.parse_k8s_pods(parse_k8s_pods_result)
# debug(f'集群 {host=} 所有Pod信息如下{results=} => 转换后:\n{parse_k8s_pods_result=}')
return parse_k8s_pods_result
def new_cluster_install(params):
    # Main logic for driving k8s installation remotely
    """
    Receives the k8s installation parameters passed in by cpcc and installs cluster nodes
    on internal-network machines through remote ssh calls.
    Both control-plane (master) and worker nodes can be installed.
    Example parameters:
    {'cluster_type': '0', 'host': '192.168.0.3', 'port': '22', 'user': 'ysh', 'password': 'Kyy@123456'}
    """
debug(f'=====new_cluster_install params: {params}')
host = params.get("host")
port = int(params.get("port"))
username = params.get("user")
password = params.get("password")
role = params.get("role")
target_file_path = "/opt/k8s_install.sh"
local_file_path="script/k8s_install.sh"
scp_map = {
local_file_path: target_file_path,
"files/kube-flannel.yml":"/opt/kube-flannel.yml",
"files/components.yaml":"/opt/components.yaml",
"files/ingress-nginx-controller.yaml":"/opt/ingress-nginx-controller.yaml",
"files/storage_class.yaml":"/opt/storage_class.yaml",
# "files/nfs-provisioner-deploy.yaml":"/opt/nfs-provisioner-deploy.yaml",
"files/nfs-rbac.yaml": "/opt/nfs-rbac.yaml",
"files/nvidia-device-plugin.yml": "/opt/nvidia-device-plugin.yml",
"script/k8s_uninstall.sh": "/opt/k8s_uninstall.sh",
"script/import_images.sh": "/opt/import_images.sh",
}
    # For a worker node the NFS server IP and share path are left empty; the master supplies them
    nfs_server_ip = host if role == "master" else str()
    nfs_share_path = "/k8sdata" if role == "master" else str()
install_clusterrole_command = ["chmod 755 %s" % target_file_path,"%s %s %s %s" % (target_file_path,role,nfs_server_ip,nfs_share_path)]
debug(f'{install_clusterrole_command=}')
try:
if username == "root":
# 如果是root用户,直接执行安装脚本
debug(f'开始Root用户安装集群节点,用户名: {username}, 角色: {role},主机: {host},端口: {port}')
ssh_utils.ssh_execute_command(host, port, username, password,
install_clusterrole_command, real_time_log=True,
scp_map=scp_map)
else:
# 如果是普通用户,需要先将处理好
debug(f'开始普通用户安装集群节点,用户名: {username}, 角色: {role},主机: {host},端口: {port}')
ssh_utils.ssh_execute_command_noroot(host, port, username, password,
install_clusterrole_command, real_time_log=True,
scp_map=scp_map,
sudo_timeout=500) # 设置较长的超时时间适应K8s安装过程
    except:
        # debug(f"Cluster node installation failed: {traceback.format_exc()}")
        raise Exception(traceback.format_exc())
results = "%s => %s节点安装成功" % (host,role)
    if role == "master":
        # Installing a control-plane node takes three steps:
        # Step 1: run the installation command
        # Step 2: obtain the join credential for worker nodes
        # Step 3: return the join credential to cpcc for storage (pcapi itself keeps no state)
clusterauth_command = ['kubeadm token create --print-join-command --ttl 0']
if username != "root":
join_idp = ssh_utils.ssh_execute_command_noroot(host, port, username, password, clusterauth_command,
real_time_log=True, sudo_timeout=60) # 获取token命令应该较快完成
else:
join_idp = ssh_utils.ssh_execute_command(host, port, username, password, clusterauth_command, real_time_log=True)
join_idp = join_idp[0][0].strip()
debug(f'集群验证码:{join_idp=}')
kubeconfig_context_command = ['cat /root/.kube/config']
if username != "root":
kubeconfig = ssh_utils.ssh_execute_command_noroot(host, port, username, password,
kubeconfig_context_command, real_time_log=True,
sudo_timeout=60) # 获取kubeconfig命令应该较快完成
else:
kubeconfig = ssh_utils.ssh_execute_command(host, port, username, password, kubeconfig_context_command, real_time_log=True)
kubeconfig = kubeconfig[0][0].strip()
debug(f'集群上下文:{kubeconfig=}')
results = join_idp + "###" + kubeconfig
    if role == "worker":
        # Installing a worker node takes two steps:
        # Step 1: run the installation command
        # Step 2: join the cluster with the join command passed in
        debug(f'Worker node joining the cluster')
join_command = params.get("join_command")
if username != "root":
ssh_utils.ssh_execute_command_noroot(host, port, username, password, [join_command],
real_time_log=True, sudo_timeout=120) # 工作节点加入可能需要一些时间
else:
ssh_utils.ssh_execute_command(host, port, username, password, [join_command], real_time_log=True)
return results
def get_multiple_cluster_pod():
    """
    Get Pod information for every cluster in the kubeconfig.
    Behaviour:
    1. Iterate over all contexts (clusters) in the kubeconfig
    2. For each cluster, collect Pod information from all namespaces
    3. Return the result keyed by cluster name
    Return value example:
    {
        "cluster1": [
            {"ip": "10.0.0.1", "namespace": "default", "name": "pod1"},
            ...
        ],
        "cluster2": [...]
    }
    """
# 获取所有集群上下文(忽略当前激活状态)
contexts, _ = config.list_kube_config_contexts()
if not contexts:
print("未找到任何集群上下文")
return
all_clusters_pods = {} # 存储所有集群的 Pod 信息
for context in contexts:
cluster_name = context["name"]
try:
# 创建集群专属的 API 客户端
api_client = config.new_client_from_config(context=cluster_name)
v1 = client.CoreV1Api(api_client)
# 收集当前集群的 Pod 信息
pods = []
for pod in v1.list_pod_for_all_namespaces().items:
pods.append({
"ip": pod.status.pod_ip,
"namespace": pod.metadata.namespace,
"name": pod.metadata.name
})
all_clusters_pods[cluster_name] = pods
except Exception as e:
print(f"集群 {cluster_name} 访问失败: {str(e)}")
return all_clusters_pods
def get_multiple_cluster():
    """
    Get the full information for every cluster, including user certificates, RBAC status, service account issuer, etc.
    The function iterates over every context in the kubeconfig file and, for each cluster:
    1. Extracts static information from the kubeconfig, such as the API server address, CA certificate data, and the user certificate and private key data
    2. Queries the Kubernetes API for dynamic information, such as node count, Kubernetes version, whether RBAC is enabled, and the service account issuer (for OIDC clusters)
    3. Handles errors that may occur while parsing the config or calling the API, and records them in the result
    Example return format:
    {
        "cluster1": {
            "context_name": "ctx1",
            "api_server": "https://1.1.1.1:6443",
            "ca_cert_data": "LS0tLS1CRUd...",
            "user_cert_data": "LS0tLS1CRUd...",
            "user_key_data": "LS0tLS1CRUd...",
            "nodes_count": 3,
            "notready_count": 0,
            "version": "1.28.3",
            "rbac_enabled": true,
            "service_account_issuer": "https://oidc.example.com",
            "error": null
        }
    }
    """
try:
config.load_kube_config()
contexts, _ = config.list_kube_config_contexts()
if not contexts:
return json.dumps({"error": "未找到任何集群上下文信息"}, indent=4)
# 直接读取 kubeconfig 文件获取原始配置
kubeconfig_path = expanduser("~/.kube/config")
with open(kubeconfig_path, 'r') as f:
config_dict = yaml.safe_load(f)
clusters_config = config_dict.get('clusters', [])
users_config = config_dict.get('users', [])
all_clusters_info = {}
for context in contexts:
cluster_name = context['context']['cluster']
user_name = context['context'].get('user')
context_name = context['name'] # 新增:获取上下文名称
cluster_info = {
'nodes_count': 0,
'notready_nodes': 0,
'k8s_version': '',
'error': None,
'server_url': '',
'context_name': context_name, # 新增字段
'user_info': {
'name': '',
'client_certificate': 'not_support',
'client_key': 'not_support',
'token': 'not_support'
}
}
# 提取 serverUrl 和用户信息
cluster_config = next(
(c for c in clusters_config if c['name'] == cluster_name),
{}
)
cluster_info['server_url'] = cluster_config.get('cluster', {}).get('server', '')
user_config = next(
(u for u in users_config if u['name'] == user_name),
{}
)
user_data = user_config.get('user', {})
# 填充用户信息
cluster_info['user_info']['name'] = user_config.get('name', '')
# cluster_info['user_info']['client_certificate'] = user_data.get('client-certificate-data', '')
# cluster_info['user_info']['client_key'] = user_data.get('client-key-data', '')
# cluster_info['user_info']['token'] = user_data.get('token', '')
try:
api_client = config.new_client_from_config(context=context['name'])
v1 = client.CoreV1Api(api_client)
version_api = client.VersionApi(api_client)
nodes = v1.list_node().items
cluster_info['nodes_count'] = len(nodes)
notready_nodes = 0
for node in nodes:
ready_condition = next(
(cond for cond in node.status.conditions
if cond.type == "Ready" and cond.status == "True"),
None
)
if not ready_condition:
notready_nodes += 1
cluster_info['notready_nodes'] = notready_nodes
version = version_api.get_code()
cluster_info['k8s_version'] = version.git_version
except ApiException as e:
cluster_info['error'] = f"API错误({e.status}): {e.reason}"
except Exception as e:
cluster_info['error'] = f"连接失败: {str(e)}"
all_clusters_info[cluster_name] = cluster_info # 仍以 cluster_name 作为键
# return json.dumps(all_clusters_info, indent=4, ensure_ascii=False)
return all_clusters_info
except Exception as e:
return json.dumps({
'error': f"系统错误: {str(e)}"
}, indent=4)
def process_kubeconfigs():
    """
    Scan the app/kubestage folder under the current directory for kubeconfig-format files,
    compute each file's uppercase MD5, rename the file to that MD5,
    and store it under the app/savekubes directory using the [md5[0]/md5[1]/md5[2]/md5] layout.
    If an MD5 collision occurs, the conflicting file is reported and skipped.
    For every cluster kubeconfig, its storage path under savekubes and other details are recorded in a database table.
    """
# 定义路径
current_dir = Path.cwd()
app_dir = current_dir / "app"
source_dir = app_dir / "kubestage"
target_dir = app_dir / "savekubes"
db_path = app_dir / "mk8s.db" # SQLite 数据库路径
# 确保目标目录存在
target_dir.mkdir(parents=True, exist_ok=True)
# 连接 SQLite 数据库(自动创建文件)
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
# 创建表(如果不存在)
cursor.execute('''
CREATE TABLE IF NOT EXISTS mk8s (
md5_hash TEXT PRIMARY KEY,
server_url TEXT NOT NULL,
now_path TEXT NOT NULL,
original_filename TEXT NOT NULL,
timestamp TEXT NOT NULL
)
''')
conn.commit()
# 记录已处理的 MD5 值
md5_map = {}
for file_path in source_dir.glob("*"):
if not file_path.is_file():
continue
try:
# 读取并解析 YAML
with open(file_path, "rb") as f:
file_content = f.read()
config_data = yaml.safe_load(file_content)
# 验证基础结构
if not all(key in config_data for key in ["apiVersion", "clusters", "contexts"]):
raise ValueError("缺少必要字段: apiVersion, clusters 或 contexts")
if not isinstance(config_data["clusters"], list) or not isinstance(config_data["contexts"], list):
raise ValueError("clusters 或 contexts 必须是列表类型")
# 提取 server URL
server_url = None
for cluster in config_data["clusters"]:
if "cluster" in cluster and "server" in cluster["cluster"]:
server_url = cluster["cluster"]["server"]
break
if not server_url:
raise ValueError("未找到有效的 server URL")
# 计算 MD5
md5_hash = hashlib.md5(file_content).hexdigest().upper()
# 检查 MD5 冲突
if md5_hash in md5_map:
print(f"MD5 冲突: 文件 {file_path}{md5_map[md5_hash]} 具有相同的 MD5 值 ({md5_hash}),跳过。")
continue
# 记录 MD5 映射
md5_map[md5_hash] = str(file_path)
# 构造目标路径并移动文件
sub_dir = target_dir / md5_hash[0] / md5_hash[1] / md5_hash[2]
target_file_path = sub_dir / md5_hash
sub_dir.mkdir(parents=True, exist_ok=True)
os.rename(file_path, target_file_path)
# print(f"已处理: {file_path} -> {target_file_path}")
print("集群新增成功! kubeconfig在: %s" % target_file_path)
# 插入到 SQLite 数据库
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
print(md5_hash, server_url, target_file_path, file_path.name, timestamp)
cursor.execute(
"INSERT INTO mk8s (md5_hash, server_url, now_path, original_filename, timestamp) "
"VALUES (?, ?, ?, ?, ?)",
(md5_hash, server_url, str(target_file_path), file_path.name, timestamp)
)
conn.commit()
# except yaml.YAMLError as e:
# error_mark = getattr(e, "problem_mark", None)
# if error_mark:
# error_line = error_mark.line + 1
# error_column = error_mark.column + 1
# error_message = (
# f"YAML 格式错误:第{error_line}行,第{error_column}列:{e.problem}"
# )
# else:
# error_message = f"YAML 解析失败:{str(e)}"
# print(f"文件 {file_path} 不是有效的 kubeconfig 格式({error_message},跳过。")
# except ValueError as e:
# print(f"文件 {file_path} 不是有效的 kubeconfig 格式({str(e)},跳过。")
# except Exception as e:
# print(f"处理文件 {file_path} 时发生未知错误:{str(e)},跳过。")
except:
import traceback
traceback.print_exc()
cursor.execute("SELECT * FROM mk8s;")
rows = cursor.fetchall()
print(rows)
# 关闭数据库连接
conn.close()
if __name__ == "__main__":
# get_multiple_cluster()
# get_multiple_cluster()
# ret = get_cluster_nodes_by_server("192_168_0_3-6443")
# print(ret)
# process_kubeconfigs()
pass


@ -0,0 +1,92 @@
import re
import json
def parse_k8s_pods(input_text):
header_mapping = {
"NAMESPACE": "pod_namespace",
"NAME": "pod_name",
"READY": "pod_ready",
"STATUS": "pod_running",
"RESTARTS": "pod_restart",
"AGE": "pod_age",
"IP": "pod_ip",
"NODE": "pod_node",
"NOMINATED NODE": "pod_nominated_node",
"READINESS GATES": "pod_readiness_gates" # 检查列名映射
}
lines = [line.strip() for line in input_text.strip().split('\n')]
if not lines:
return {"total": 0, "rows": []}
raw_headers = re.split(r'\s{2,}', lines[0])
rows = []
for line in lines[1:]:
values = re.split(r'\s{2,}', line.strip())
if len(values) == len(raw_headers):
mapped = {}
for i, raw_h in enumerate(raw_headers):
if raw_h in header_mapping:
mapped[header_mapping[raw_h]] = values[i]
rows.append(mapped)
return {
"total": len(rows),
"rows": rows
}
def parse_k8s_nodes(input_text):
# 定义表头映射关系(原始表头 -> 目标字段名)
header_mapping = {
"NAME": "node_name",
"STATUS": "node_status",
"ROLES": "node_role",
"AGE": "node_age",
"VERSION": "node_version",
"INTERNAL-IP": "node_internalip",
"EXTERNAL-IP": "node_externalip",
"OS-IMAGE": "node_osversion",
"KERNEL-VERSION": "node_kernelversion",
"CONTAINER-RUNTIME": "node_containeruntime",
"LABELS":"node_labels",
}
lines = [line.strip() for line in input_text.strip().split('\n')]
# 处理表头(应用映射关系)
raw_headers = re.split(r'\s{2,}', lines[0])
headers = [header_mapping[h] for h in raw_headers if h in header_mapping] # 确保只保留存在映射的字段
rows = []
for line in lines[1:]:
values = re.split(r'\s{2,}', line.strip())
if len(values) == len(raw_headers): # 使用原始表头长度进行匹配(保持列数一致)
# 按原始表头顺序映射到目标字段名
mapped_values = {header_mapping[raw_headers[i]]: values[i] for i in range(len(raw_headers)) if raw_headers[i] in header_mapping}
rows.append(mapped_values)
result = {
"total": len(rows),
"rows": rows
}
return result
if __name__ == "__main__":
# 示例输入(你的 kubectl 输出)
input_text1 = '''NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME
k8s-master Ready control-plane 4d19h v1.28.2 192.168.0.3 <none> Ubuntu 22.04.1 LTS 5.15.0-91-generic containerd://1.7.24'''
# 执行转换
# output1 = parse_k8s_nodes(input_text1)
# print(output1)
input_text2 = '''NAMESPACE NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
kube-flannel kube-flannel-ds-sbkgm 1/1 Running 0 14m 192.168.0.2 k8s-worker-20250408132243 <none> <none>'''
result = parse_k8s_pods(input_text2)
print(json.dumps(result, indent=2))


@ -0,0 +1,16 @@
import socket
def get_local_ipv4():
"""获取本机IPv4地址"""
try:
# 获取本机所有网络接口信息
local_ip = socket.gethostbyname(socket.gethostname())
except socket.gaierror:
# 如果上述方式失败(例如,主机名无法解析),则尝试以下备用方法
local_ip = ([l for l in ([ip for ip in socket.gethostbyname_ex(socket.gethostname())[2]
if not ip.startswith("127.")][:1], [[(s.connect(('8.8.8.8', 53)), s.getsockname()[0], s.close())
for s in [socket.socket(socket.AF_INET, socket.SOCK_DGRAM)]][0][1]]) if l][0][0])
return local_ip
if __name__ == "__main__":
print(get_local_ipv4())

app/k8sManager/ssh_utils.py (Normal file, +240 lines)

@ -0,0 +1,240 @@
# Not used for now
# from appPublic import sshx
# TODO: switch to g.debug later when there is time
import time
import os
import re
from appPublic.log import debug
import paramiko
import socket
import traceback
def ssh_execute_command(host, port, username, password, commands, real_time_log=False,
remote_exec=True, scp_map=dict()):
try:
        # Create the SSH client object
        ssh = paramiko.SSHClient()
        # Allow connecting to hosts that are not in known_hosts
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        # Connect to the server
        ssh.connect(hostname=host, port=port, username=username, password=password)
all_results = []
        if scp_map:
            # Create the SFTP client object
            sftp = ssh.open_sftp()
            # Copy each local file to its remote destination
            for sf,df in scp_map.items():
                # Upload the file
                debug(f"Remote copy {sf=} => {df=}")
                sftp.put(sf, df)
            # Close the SFTP connection
            sftp.close()
if remote_exec:
# 通用流程
result = ""
error = ""
for command in commands:
stdin, stdout, stderr = ssh.exec_command(f'{command}', get_pty=True)
stdin.flush()
if real_time_log:
debug(f"开始执行命令: {command=}, 请耐心等待...")
# 实时读取标准输出
for line in iter(stdout.readline, ""):
debug(f'{line=}')
result += line
# 实时读取标准错误输出
for line in iter(stderr.readline, ""):
debug(f'{line=}')
error += line
else:
result = stdout.read().decode(errors="replace")
error = stderr.read().decode(errors="replace")
all_results.append((result, error))
if real_time_log:
debug(f"命令 {command=} 执行结束")
# 关闭连接
ssh.close()
return all_results
except Exception as e:
debug(f"SSH连接或执行命令时出错: {e=}")
return [e]
# ---------------------------------------- Helpers that let a non-root user perform root operations ----------------------------------------
def ssh_execute_command_noroot(host, port, username, password, commands, real_time_log=False,
                               remote_exec=True, scp_map=dict(), temp_dir="/tmp/ssh_temp", sudo_timeout=500):
    """
    Enhanced SSH command execution: lets a non-root user copy files into root-owned
    locations and run sudo commands.
    The sudo_timeout parameter controls the timeout of sudo commands.
    """
try:
# 创建SSH连接
ssh = paramiko.SSHClient()
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
ssh.connect(hostname=host, port=port, username=username, password=password)
all_results = []
# 创建临时目录(如果需要文件传输)
if scp_map:
# 创建临时目录
create_temp_cmd = f"mkdir -p {temp_dir} && chmod 700 {temp_dir}"
stdin, stdout, stderr = ssh.exec_command(create_temp_cmd)
create_error = stderr.read().decode(errors="replace")
if create_error:
raise Exception(f"创建临时目录失败: {create_error}")
# 创建SFTP客户端
sftp = ssh.open_sftp()
# 上传文件到临时目录
temp_scp_map = {}
for local_path, remote_path in scp_map.items():
# 确定临时目标路径
temp_remote_path = f"{temp_dir}/{os.path.basename(remote_path)}"
debug(f"上传文件 {local_path} => {temp_remote_path}")
sftp.put(local_path, temp_remote_path)
temp_scp_map[temp_remote_path] = remote_path
# 关闭SFTP连接
sftp.close()
# 将文件从临时目录移动到目标位置需要sudo权限
for temp_path, final_path in temp_scp_map.items():
# 确保目标目录存在
mkdir_cmd = f"sudo mkdir -p $(dirname {final_path})"
execute_sudo_command(ssh, mkdir_cmd, password, real_time_log, sudo_timeout, username)
# 移动文件
move_cmd = f"sudo mv {temp_path} {final_path}"
execute_sudo_command(ssh, move_cmd, password, real_time_log, sudo_timeout, username)
# 设置文件权限
chmod_cmd = f"sudo chmod 644 {final_path}"
if final_path.endswith('.sh'): # 脚本文件设置可执行权限
chmod_cmd = f"sudo chmod 755 {final_path}"
execute_sudo_command(ssh, chmod_cmd, password, real_time_log, sudo_timeout, username)
# 执行远程命令(如果需要)
if remote_exec:
for command in commands:
# 执行需要sudo权限的命令
result, error = execute_sudo_command(ssh, command, password, real_time_log, sudo_timeout, username)
all_results.append((result, error))
# 清理临时目录
if scp_map:
cleanup_cmd = f"rm -rf {temp_dir}"
stdin, stdout, stderr = ssh.exec_command(cleanup_cmd)
# 关闭SSH连接
ssh.close()
return all_results
    except Exception as e:
        debug(f"SSH operation error: {traceback.format_exc()}")
        raise e
def execute_sudo_command(ssh, command, password, real_time_log, sudo_timeout, username):
    """
    Run a command that requires sudo, handling the password prompt and timeouts.
    """
    sudo_cmd = f"sudo -S -p '[sudo] password: ' {command}"  # -S reads the password from stdin
    # sudo_cmd = f"sudo -k -S -p '[sudo] password: ' {command}"  # -k would force a password prompt every time
stdin, stdout, stderr = ssh.exec_command(sudo_cmd, get_pty=True)
# 真实的sudo命令执行
# sudo -p '[sudo] password: ' echo hello
# 设置命令超时
channel = stdout.channel
channel.settimeout(timeout=sudo_timeout)
# 处理密码提示
password_prompt = False
initial_output = ""
try:
debug("等待sudo密码提示...")
start_time = time.time()
while True:
ready = False
# 检查stdout
if channel.recv_ready():
# 此处发现chunk长度可能为一个标准kubeconfig长度导致无法正确读取所有输出
chunk = channel.recv(5800).decode(errors="replace")
initial_output += chunk
debug(f"stdout: {chunk.strip()}")
if re.search(r"\[sudo\] password:", chunk):
password_prompt = True
stdin.write(f"{password}\n")
stdin.flush()
break
ready = True
# 检查stderr
if channel.recv_stderr_ready():
chunk = channel.recv_stderr(5800).decode(errors="replace")
initial_output += chunk
debug(f"stderr: {chunk.strip()}")
if re.search(r"\[sudo\] password:", chunk):
password_prompt = True
stdin.write(f"{password}\n")
stdin.flush()
break
ready = True
# 超时检测
if time.time() - start_time > sudo_timeout:
raise Exception(f"等待sudo密码提示超时{sudo_timeout}秒): {sudo_cmd}")
if not ready:
time.sleep(1.5) # 避免CPU占用过高
# 如果没有收到密码提示但命令执行超时,可能是权限问题
if not password_prompt:
# 等待一段时间,确保没有密码提示
time.sleep(3)
# debug(f"ssh初步连接初始输出: {initial_output}")
if not re.search(r"\[sudo\] password:", initial_output):
raise Exception(f"未收到密码提示可能sudo配置不允许该用户执行此命令: {sudo_cmd}")
except socket.timeout:
raise Exception(f"命令执行超时({sudo_timeout}秒): {sudo_cmd}")
# 收集命令输出
result = initial_output if not password_prompt else ""
error = ""
try:
if real_time_log:
debug(f"执行命令: {sudo_cmd}")
# 实时读取标准输出
while True:
if channel.recv_ready():
line = channel.recv(5800).decode(errors="replace")
debug(f"输出: {line.strip()}")
result += line
if channel.recv_stderr_ready():
line = channel.recv_stderr(5800).decode(errors="replace")
debug(f"错误: {line.strip()}")
error += line
if channel.exit_status_ready():
break
time.sleep(1.5) # 避免CPU占用过高
else:
# 非实时模式读取输出
result += channel.recv(-1).decode(errors="replace") if channel.recv_ready() else ""
error += channel.recv_stderr(-1).decode(errors="replace") if channel.recv_stderr_ready() else ""
except socket.timeout:
raise Exception(f"命令执行超时({sudo_timeout}秒): {sudo_cmd}")
# 获取命令退出状态
exit_status = channel.recv_exit_status()
# 检查sudo执行是否失败
if exit_status != 0:
if "incorrect password attempt" in error.lower():
error = f"密码错误无法执行sudo命令: {sudo_cmd}"
elif "not allowed to run sudo" in error.lower():
error = f"用户 {username} 没有sudo权限执行此命令: {sudo_cmd}"
return result, error

app/ldap/ldapOperate.py (Normal file, +100 lines)

@ -0,0 +1,100 @@
from ldap3 import Server, Connection, ALL, NTLM, SUBTREE,MODIFY_REPLACE
import json
# LDAP server information
# ldap_server_uri = 'ldap://127.0.0.1:7389'  # or ldaps://your-ldap-server-secure
ldap_server_uri = 'ldap://10.8.64.15'  # or ldaps://your-ldap-server-secure
ldap_user = 'cn=admin,dc=test,dc=com'
ldap_password = '123456'
ldap_base = 'dc=test,dc=com'
# Create the LDAP server object
server = Server(ldap_server_uri, get_info=ALL)
# Create the connection object and bind as the admin user
conn = Connection(server, user=ldap_user, password=ldap_password, auto_bind=True)
def get_all_ldap_user():
# 搜索条目
search_filter = '(objectClass=person)'
search_attribute = ['cn', 'sn', 'mail']
conn.search(search_base=ldap_base,
search_filter=search_filter,
search_scope=SUBTREE,
attributes=search_attribute)
result=[ json.loads(x.entry_to_json())for x in conn.entries]
return result
def get_all_ldap_cn():
# 搜索条目
search_filter = '(objectClass=posixGroup)'
search_attribute = ['cn', 'objectClass', 'gidNumber']
conn.search(search_base=ldap_base,
search_filter=search_filter,
search_scope=SUBTREE,
attributes=search_attribute)
result=[ json.loads(x.entry_to_json())for x in conn.entries]
return result
def get_one_cn(cn):
# 搜索条目
search_filter = f'(&(cn={cn})(objectClass=posixGroup))'
search_attribute = ['cn', 'objectClass', 'gidNumber']
conn.search(search_base=ldap_base,
search_filter=search_filter,
search_scope=SUBTREE,
attributes=search_attribute
)
if conn.entries is None:
return None
else:
return json.loads(conn.entries[0].entry_to_json())
'''
Example arguments:
# uid="test_add1"
# plaintext_password="654321"
# uid_number=123456
# cn="test"
# add_ldap_user(uid, uid_number, plaintext_password, cn)
'''
def add_ldap_user(uid,uid_number,plaintext_password,cn ):
cn_attr=get_one_cn(cn)
new_user_dn=f"uid={uid},ou=test,{ldap_base}"
new_user_attrs={
"objectClass": ["top", "posixAccount", "inetOrgPerson", "shadowAccount"],
"uidNumber":uid_number,
"gidNumber":cn_attr["attributes"]["gidNumber"],
'sn':[uid],
'loginShell': ["/bin/bash"],
'homeDirectory':["/srv/nfs/"+uid],
'cn':[cn]
}
flag=conn.add(new_user_dn,new_user_attrs["objectClass"],new_user_attrs)
print(conn.result)
if flag is True:
return modify_password(new_user_dn, plaintext_password)
else:
return conn.result
def modify_password(new_user_dn,plaintext_password):
mod_attrs = {
'userPassword': (
MODIFY_REPLACE,[plaintext_password])
}
conn.modify(new_user_dn, mod_attrs)
return conn.result
def delete_ldap_user(uid):
user_dn=f"uid={uid},ou=test,{ldap_base}"
conn.delete(user_dn)
return conn.result

app/pcapi.py (Normal file, +94 lines)

@ -0,0 +1,94 @@
from ahserver.serverenv import ServerEnv
from ahserver.webapp import webapp
from ahserver.auth_api import get_client_ip
from ahserver.auth_api import AuthAPI
from appPublic.argsConvert import ArgsConvert
from appPublic.jsonConfig import getConfig
from appPublic.log import debug
from appPublic.worker import awaitify
from aiohttp import BasicAuth
from storage.common import get_storage_json  # example
# from ldap.ldapOperate import *  # no LDAP server is available at the moment
# Core interfaces for k8s multi-cluster management
from k8sManager.multiple_clusters import *
async def checkuserpasswd(obj, request, user, passwd):
auth = request.headers.get('Authorization')
if auth is None:
debug(f'auth is None, {request.headers=}')
return False
if auth.startswith('Basic '):
auther = BasicAuth('x')
m = auther.decode(auth)
username = m.login
password = m.password
config = getConfig()
if username != config.authentication.user:
debug(f'{username=},{password=}, user not match')
return False
if password != config.authentication.password:
debug(f'{username=},{password=}, password not match')
return False
ip = get_client_ip(None, request)
if ip not in config.authentication.iplist:
debug(f'{username=},{password=}, ip not in allowed ip pools')
return False
return True
debug(f'not a basic authentication')
return False
async def create_namespaced_job_v1(namespace, jobdesc):
batch_v1 = client.BatchV1Api()
f = awaitify(batch_v1.create_namespaced_job)
return await f(namespace=namespace, body=jobdesc)
async def determine_accommodat_by_kubeconfig_v1(params):
f = awaitify(determine_accommodat_by_kubeconfig)
return await f(params)
def paramify(data, ns):
ac = ArgsConvert('${', '}$')
d = ac.convert(data, ns)
return d
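# Usage sketch (illustrative values):
# paramify({"k": "${key}$"}, {"key": "abc"}) -> {"k": "abc"}
# ArgsConvert substitutes every ${...}$ placeholder with the matching value from ns.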
def init_func():
AuthAPI.checkUserPermission = checkuserpasswd
g = ServerEnv()
    # Example code
g.create_namespaced_job_v1 = create_namespaced_job_v1
g.paramify = paramify
g.debug = debug
    ### LDAP-related
# g.add_ldap_user=add_ldap_user
# g.get_all_ldap_user=get_all_ldap_user
# g.get_all_ldap_cn=get_all_ldap_cn
# g.get_one_cn=get_one_cn
# g.modify_password=modify_password
# g.delete_ldap_user=delete_ldap_user
    ### k8s multi-cluster related
g.new_cluster_install = new_cluster_install
g.get_multiple_cluster = get_multiple_cluster
g.get_multiple_cluster_pod = get_multiple_cluster_pod
g.get_cluster_nodes_by_server = get_cluster_nodes_by_server
g.get_cluster_pods_by_server = get_cluster_pods_by_server
g.delete_cluster_node = delete_cluster_node
g.node_state_switch = node_state_switch
g.yaml_apply_delete = yaml_apply_delete
g.get_cluster_nodes_by_kubeconfig = get_cluster_nodes_by_kubeconfig
g.determine_accommodat_by_kubeconfig = determine_accommodat_by_kubeconfig
g.get_cluster_pods_by_kubeconfig = get_cluster_pods_by_kubeconfig
g.node_label_opt = node_label_opt
g.get_storage_json=get_storage_json
g.result_dict={
"status":False,
"info":"operate failed",
"data":{}
}
if __name__ == '__main__':
webapp(init_func)

app/slurm/__init__.py (Normal file, +0 lines)

app/slurm/job.py (Normal file, +87 lines)

@ -0,0 +1,87 @@
from . import parse_job
from . import sshClient
import json
def get_history_job_command(query):
command="sacct -a -p "
if "startStartTime" in query:
command=command+"-S "+query["startStartTime"]+" "
if "group" in query:
command=command+"-g "+query["group"]+" "
if "jobId" in query:
command=command+"-j "+query["jobId"]+" "
if "accountUserName" in query:
command=command+"-u "+query["accountUserName"]+" "
if "jobIdList" in query:
command = command + "-j"
for jobId in query["jobIdList"]:
command=command+"-j "+jobId+" "
command=command+"--format=JobId,JobName%30,User%50,state,partition,NodeList,AllocCPUS,Submit,Start,End,Group,Workdir%100,Priority,ReqTRES%50"+" "
command=command+" "+"-X"
return command
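# Usage sketch (illustrative): get_history_job_command({"jobId": "123"}) yields roughly
# "sacct -a -p -j 123 --format=JobId,JobName%30,...,ReqTRES%50  -X"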
'''
Get historical jobs
'''
def get_history_list(data):
command=get_history_job_command(data)
return sshClient.exec_command(command)
def get_history_list_json(data):
result=get_history_list(data)
result_json=parse_job.process_data(result)
return result_json
'''
Submit a job
'''
def submit_job(command):
command=command.replace("\r"," ")
return sshClient.exec_command(command)
'''
Resume a job
'''
def resume_job(jobId):
command="sudo scontrol resume "+jobId
return sshClient.exec_command(command)
'''
Suspend a job
'''
def suspend_job(jobId):
command="sudo scontrol suspend "+jobId
return sshClient.exec_command(command)
'''
Kill (cancel) a job
'''
def kill_job(jobId):
command="sudo scancel "+jobId
return sshClient.exec_command(command)
'''
Get currently running (real-time) jobs
'''
def get_real_time_list(query):
command="squeue -a "
return sshClient.exec_command(command)
'''
Get currently running (real-time) jobs as JSON
'''
def get_real_time_list_json(query):
command="squeue -a --json"
result= sshClient.exec_command(command)
std_out=result["stdout"]
std_out=json.loads(std_out)
return std_out

app/slurm/node.py (Normal file, +57 lines)

@ -0,0 +1,57 @@
from . import sshClient
'''
Get node details
'''
def get_node_details_json(NodeName):
command = "scontrol show node "
if NodeName is not None:
command=command+NodeName
print(command)
result= sshClient.exec_command(command)
data_str=result["stdout"]
    # Split the string on blank lines to get each node's data
    nodes_data = data_str.strip().split('\n\n')
    # List holding one dict per node
    nodes_list = []
    # Iterate over each node's data
    for node_data in nodes_data:
        # Dict of key/value pairs for the current node
        node_dict = {}
        # Split the current node's data into lines
        lines = node_data.strip().split('\n')
for line in lines:
if "OS" in line:
node_dict["OS"]=line.split("=")[1]
else:
# 按空格分割键值对
key_value_pairs = line.strip().split()
for pair in key_value_pairs:
pair_list= pair.split('=')
if len(pair_list) < 2:
key=pair_list[0]
value=""
else:
key=pair_list[0]
value=pair_list[1]
# 将键和值添加到字典中
node_dict[key] = value
# 将当前节点的字典添加到列表中
nodes_list.append(node_dict)
return nodes_list
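# Usage sketch (illustrative): get_node_details_json("node1") returns one dict per node,
# keyed by the scontrol fields, e.g. [{"NodeName": "node1", "State": "IDLE", "OS": "...", ...}];
# the actual keys depend on the scontrol output.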
def update_node(dict_data):
command="sudo scontrol update"
if dict_data["NodeName"] is not None:
command=command+" "+"NodeName="+dict_data["NodeName"]
command=command+" "+"State="+dict_data["State"]
command=command+" "+"Reason="+dict_data["Reason"]
result= sshClient.exec_command(command)
return result

app/slurm/parse_job.py (Normal file, +201 lines)

@ -0,0 +1,201 @@
import re
from datetime import datetime
from typing import List
import json
class jobJsonVO:
def __init__(self):
self.jobId = None
self.jobName = None
self.accountUserName = None
self.status = None
self.queueName = None
self.execHosts = None
self.numProcessors = None
self.submitTime = None
self.startTime = None
self.endTime = None
self.userGroup = None
self.workDir = None
self.userPriority = None
self.gpuCardNum = None
self.runningTime = None
self.formatRunningTime = None
self.jobProcessorTime = None
self.jobGpuCardTime = None
def get_start_time(self):
return self.startTime
def set_start_time(self, startTime):
self.startTime = startTime
def get_end_time(self):
return self.endTime
def set_end_time(self, endTime):
self.endTime = endTime
def get_status(self):
return self.status
def getNumProcessors(self):
return self.numProcessors
def getRunningTime(self):
return self.runningTime
class JobConstants:
DONE = "DONE"
EXIT = "EXIT"
RUN = "RUN"
PEND = "PEND"
CANCELLED = "CANCELLED"
def parse_status(status: str) -> str:
if status == "COMPLETED":
return JobConstants.DONE
if status == "FAILED":
return JobConstants.EXIT
if status == "RUNNING":
return JobConstants.RUN
if status == "PENDING":
return JobConstants.PEND
if "CANCELLED" in status:
return JobConstants.CANCELLED
return status
def parse_slurm_str_to_str(date_str: str) -> str:
# Implement your date parsing logic here
date_str=date_str.replace("T"," ")
date_str=date_str.replace("Z"," ")
return date_str
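# Usage sketch (illustrative): parse_slurm_str_to_str("2025-07-16T14:46:24") -> "2025-07-16 14:46:24"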
def handle_alloc_tres_get_gpus(tres_str: str) -> int:
# Implement your GPU card number extraction logic here
return 0
def calculate_processor_time(job: jobJsonVO) -> float:
# Implement your processor time calculation logic here
processors = job.getNumProcessors()
runningTime = job.getRunningTime()
processorsRunningTime = processors * runningTime
return processorsRunningTime
def calculate_gpu_card_time(job: jobJsonVO) -> float:
# Implement your GPU card time calculation logic here
return 0.0
def parse_date(date_str: str) -> datetime:
    # Parse a string into a datetime object.
    # The default format is used here; adjust it to match your data if needed.
    return datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")
def get_now_date() -> datetime:
    # Return the current time as a datetime object
    return datetime.now()
def get_running_time(item: jobJsonVO) -> int:
t = item.get_start_time()
if t is None:
return 0
start_time_temp = parse_slurm_str_to_str(item.get_start_time())
item.set_start_time(start_time_temp)
if start_time_temp == "-":
return 0
    start_time_date = datetime.now()  # defaults to the current time
if item.get_status() != JobConstants.PEND:
start_time_date = parse_date(start_time_temp)
running_time = 0
status = item.get_status()
if status in [JobConstants.DONE, JobConstants.EXIT]:
end_time_temp = start_time_temp if item.get_end_time() is None else parse_slurm_str_to_str(item.get_end_time())
end_time_date = parse_date(end_time_temp)
item.set_end_time(end_time_temp)
running_time = int((end_time_date - start_time_date).total_seconds() * 1000) # 转换为毫秒
elif status == JobConstants.RUN:
running_time = int((get_now_date() - start_time_date).total_seconds() * 1000) # 转换为毫秒
elif status == JobConstants.CANCELLED:
end_time_date = parse_date(item.get_end_time()) if item.get_end_time() else start_time_date
running_time = int((end_time_date - start_time_date).total_seconds() * 1000) # 转换为毫秒
elif status == JobConstants.PEND:
running_time = 0
return running_time
def format_running_time(job: jobJsonVO) -> str:
# Implement your running time formatting logic here
return ""
def process_data(data: dict) -> List[jobJsonVO]:
try:
item_list = data["stdout"].split("\n")
job_json_list = []
for i in range(len(item_list)):
if len(item_list) < 1:
return []
if i < 1:
continue
words = item_list[i].split("|")
word_list = [word.strip() for word in words if word.strip()]
if len(word_list) < 14:
continue
job_json = jobJsonVO()
try:
jobId = int(word_list[0])
except ValueError:
continue
job_json.jobId = word_list[0]
job_json.jobName = word_list[1]
job_json.accountUserName = word_list[2]
job_json.status = parse_status(word_list[3])
job_json.queueName = word_list[4]
job_json.execHosts = word_list[5]
try:
job_json.numProcessors = int(word_list[6])
job_json.submitTime = parse_slurm_str_to_str(word_list[7])
job_json.startTime = parse_slurm_str_to_str(word_list[8])
except (ValueError, IndexError):
continue
if job_json.status == "DONE" or len(word_list) >= 10:
if word_list[9] != "Unknown":
job_json.endTime = parse_slurm_str_to_str(word_list[9])
job_json.userGroup = word_list[10]
job_json.workDir = word_list[11]
job_json.userPriority = int(word_list[12])
job_json.gpuCardNum = handle_alloc_tres_get_gpus(word_list[13])
job_json.runningTime = get_running_time(job_json)
job_json.formatRunningTime = format_running_time(job_json)
job_json.jobProcessorTime = calculate_processor_time(job_json)
job_json.jobGpuCardTime = calculate_gpu_card_time(job_json)
job_dict = job_json.__dict__
job_json_list.append(job_dict)
job_json_list.reverse()
return job_json_list
except Exception as e:
print(f"An error occurred: {e}")
raise Exception("CLUSTER_ERROR")
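# --- Hedged usage example ---
# process_data expects a dict with a "stdout" field whose first line is a header
# and whose remaining lines are pipe-delimited sacct-style records with at least
# 14 fields. The record below is fabricated for illustration only.
if __name__ == "__main__":
    sample = {
        "stdout": (
            "JobID|JobName|User|State|Partition|NodeList|NCPUS|Submit|Start|End|Group|WorkDir|Priority|AllocTRES\n"
            "1001|train|alice|RUNNING|gpu|node01|8|2025-07-01T08:00:00|2025-07-01T08:01:00|Unknown|users|/home/alice|100|cpu=8,gres/gpu=2\n"
        )
    }
    print(process_data(sample))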

90
app/slurm/partition.py Normal file
View File

@ -0,0 +1,90 @@
from . import sshClient
import json
'''
Create a partition (queue).
'''
def create_partition(params):
command = "sudo scontrol create "
for key, value in params.items():
command = command + key + "=" + str(value) + " "
print(command)
return sshClient.exec_command(command)
'''
Update a partition (queue).
'''
def update_partition(params):
command = "sudo scontrol update "
for key, value in params.items():
command = command + key + "=" + str(value) + " "
return sshClient.exec_command(command)
'''
Delete a partition (queue).
'''
def delete_partition(params):
command = "sudo scontrol delete "
for key, value in params.items():
command = command + key + "=" + str(value) + " "
return sshClient.exec_command(command)
'''
Show partition details.
'''
def list_partition_detail(PartitionName):
command = "sudo scontrol show part "
if PartitionName is not None:
command = command + PartitionName + " "
return sshClient.exec_command(command)
'''
Show brief partition information.
'''
def list_partition_info(query):
command = "sudo sinfo "
return sshClient.exec_command(command)
'''
Show brief partition information as JSON.
'''
def list_partition_detail_json(query):
command = "scontrol show part "
if query["partitionName"] is not None:
command=command+ " "+query["partitionName"]
result=sshClient.exec_command(command)
data_str = result["stdout"]
# Split on blank lines; each block describes one partition
partition_blocks = data_str.strip().split('\n\n')
# Collect one dict per partition
partition_list = []
# Iterate over the partition blocks
for partition_block in partition_blocks:
# Key/value pairs of the current partition
partition_dict = {}
# Split the current block into lines
lines = partition_block.strip().split('\n')
for line in lines:
# Split each line into whitespace-separated key=value pairs
key_value_pairs = line.strip().split()
for pair in key_value_pairs:
pair_list = pair.split('=')
if len(pair_list) < 2:
key = pair_list[0]
value = ""
else:
key = pair_list[0]
value = pair_list[1]
# Store the key and value in the dict
partition_dict[key] = value
# Append the current partition's dict to the list
partition_list.append(partition_dict)
return partition_list
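# --- Hedged usage sketch ---
# The scontrol parameter names below (PartitionName, Nodes, MaxTime, State) are
# standard Slurm keys, but the exact set your cluster accepts may differ. Running
# this executes scontrol over the SSH connection configured in sshClient.
if __name__ == "__main__":
    print(create_partition({"PartitionName": "debug", "Nodes": "node[01-02]", "MaxTime": "01:00:00", "State": "UP"}))
    print(list_partition_detail_json({"partitionName": "debug"}))
    print(delete_partition({"PartitionName": "debug"}))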

55
app/slurm/sshClient.py Normal file
View File

@ -0,0 +1,55 @@
import paramiko
def exec_command(command):
# SSH connection parameters
hostname = '10.8.64.15'
port = 22 # SSH port, default 22
username = 'ceni'
password = '1qazXSW@34'
# hostname = '127.0.0.1'
# port = 722 # SSH port, default 22
# username = 'ceni'
# password = '1qazXSW@34'
# Create the SSH client
client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect(hostname, port, username, password)
# Run the command
stdin, stdout, stderr = client.exec_command(command)
result_out = stdout.read().decode("utf-8")
result_error = stderr.read().decode("utf-8")
result = {
"stdout": result_out,
"stderr": result_error
}
# Close the connection
client.close()
return result
def exec_command_hostname(command,hostname,port,username,password):
# Create the SSH client
client = paramiko.SSHClient()
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
client.connect(hostname, port, username, password)
# Run the command
stdin, stdout, stderr = client.exec_command(command)
result_out = stdout.read().decode("utf-8")
result_error = stderr.read().decode("utf-8")
result = {
"stdout": result_out,
"stderr": result_error
}
# Close the connection
client.close()
return result
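# --- Hedged sketch (assumed environment variable names, not an existing project convention) ---
# Shows how the hard-coded credentials above could instead come from the
# environment, reusing exec_command_hostname.
import os

def exec_command_from_env(command):
    return exec_command_hostname(
        command,
        hostname=os.environ.get("PCAPI_SSH_HOST", "127.0.0.1"),
        port=int(os.environ.get("PCAPI_SSH_PORT", "22")),
        username=os.environ.get("PCAPI_SSH_USER", ""),
        password=os.environ.get("PCAPI_SSH_PASSWORD", ""),
    )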

0
app/storage/__init__.py Normal file
View File

32
app/storage/common.py Normal file
View File

@ -0,0 +1,32 @@
from slurm import sshClient
import json
def df_to_json(df_output):
# Parse the df output
lines = df_output.strip().split("\n")
headers = lines[0].split()
data = []
for line in lines[1:]:
values = line.split()
entry = {
headers[0]: values[0], # filesystem
headers[1]: values[1], # type
headers[2]: values[2], # size
headers[3]: values[3], # used
headers[4]: values[4], # avail
headers[5]: values[5], # use_percent
headers[6]: values[6], # mounted_on
}
data.append(entry)
return data
def get_storage_json(point):
command="df -h --output=source,fstype,size,used,avail,pcent,target"
if point is not None:
command=command+" "+point
result=sshClient.exec_command(command)
stdout=result["stdout"]
print(result)
result_json=df_to_json(stdout)
print(result_json)
return result_json
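# --- Hedged usage example ---
# df_to_json parses the whitespace-separated df output requested by
# get_storage_json (source, fstype, size, used, avail, pcent, target).
# The sample output below is fabricated for illustration.
if __name__ == "__main__":
    sample_output = (
        "Filesystem Type Size Used Avail Use% Mounted\n"
        "/dev/sda1 ext4 100G 40G 60G 40% /\n"
        "nfs01:/data nfs 1.0T 200G 824G 20% /data\n"
    )
    print(df_to_json(sample_output))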

View File

@ -0,0 +1,336 @@
from kubernetes import client, config
import json
import re
from typing import Dict, Any, List, Tuple
import yaml
import time
def get_node_info(kubeconfig):
try:
# Load the kubeconfig
kubeconfig = yaml.safe_load(kubeconfig)
config.load_kube_config_from_dict(kubeconfig)
v1 = client.CoreV1Api()
api_client = client.ApiClient()
# Fetch node metrics and the Pod list
node_metrics_path = "/apis/metrics.k8s.io/v1beta1/nodes"
node_metrics_response = api_client.call_api(
node_metrics_path, 'GET', auth_settings=['BearerToken'], response_type='object')[0]
node_metrics = {node['metadata']['name']: node.get('usage', {})
for node in node_metrics_response.get('items', [])}
# Fetch all Pods and their resource requests
pods = v1.list_pod_for_all_namespaces().items
node_pod_resources = {} # resource requests of the Pods on each node
for pod in pods:
if pod.spec.node_name and pod.status.phase in ["Running", "Pending"]:
node_name = pod.spec.node_name
if node_name not in node_pod_resources:
node_pod_resources[node_name] = {
"cpu": 0,
"memory": 0,
"gpu": 0
}
# Accumulate the resources requested by each container
for container in pod.spec.containers:
if container.resources and container.resources.requests:
# CPU (convert to millicores)
cpu_request = container.resources.requests.get("cpu", "0m")
cpu_millis = int(float(cpu_request.rstrip("m"))) if "m" in cpu_request else int(float(cpu_request) * 1000)
node_pod_resources[node_name]["cpu"] += cpu_millis
# Memory (convert to bytes)
memory_request = container.resources.requests.get("memory", "0")
memory_bytes = int(float(memory_request.rstrip("KiMiGi")))
if "Ki" in memory_request:
memory_bytes *= 1024
elif "Mi" in memory_request:
memory_bytes *= 1024 * 1024
elif "Gi" in memory_request:
memory_bytes *= 1024 * 1024 * 1024
node_pod_resources[node_name]["memory"] += memory_bytes
# GPU
gpu_request = container.resources.requests.get("nvidia.com/gpu", "0")
node_pod_resources[node_name]["gpu"] += int(gpu_request)
# List the nodes and compute resource usage
nodes = v1.list_node().items
rows = []
for node in nodes:
node_name = node.metadata.name
internal_ip = next((address.address for address in node.status.addresses
if address.type == "InternalIP"), "未分配")
external_ip = next((address.address for address in node.status.addresses
if address.type == "ExternalIP"), "未分配")
status = node.status.conditions[-1].status if node.status.conditions else "Unknown"
status = "已就绪" if status == "True" else "未就绪"
# Node roles
roles = []
role_labels = [
"node-role.kubernetes.io/control-plane",
"node-role.kubernetes.io/master",
"node-role.kubernetes.io/worker"
]
for label in role_labels:
if label in node.metadata.labels:
roles.append(label.split("/")[-1])
roles_str = "控制节点" if roles else "工作节点"
# Node uptime
running_time = time.time() - node.metadata.creation_timestamp.timestamp()
node_age = running_time
# Node info
k8s_version = node.status.node_info.kubelet_version
os_image = node.status.node_info.os_image
kernel_version = node.status.node_info.kernel_version
container_runtime = node.status.node_info.container_runtime_version
# Custom labels
labels = node.metadata.labels
kyy_labels = [f"{k}={v}" for k, v in labels.items() if k.startswith('kyy-')]
# Real-time resource usage
cpu_usage = node_metrics.get(node_name, {}).get('cpu', 'undefined')
if cpu_usage and isinstance(cpu_usage, str) and cpu_usage != 'undefined':
cpu_usage = int(cpu_usage.replace("n", ""))
cpu_usage = f'{(cpu_usage / 1000000 / 1000):.3f}'
memory_usage = node_metrics.get(node_name, {}).get('memory', 'undefined')
if memory_usage and isinstance(memory_usage, str) and memory_usage != 'undefined':
memory_usage = int(memory_usage.replace("Ki", ""))
memory_usage = f"{(memory_usage / 1024 / 1024):.3f}Gi"
# Node total resources (parse_resource_value returns memory in Gi)
total_cpu = float(node.status.allocatable.get("cpu", "0"))
total_memory = parse_resource_value(node.status.allocatable.get("memory", "0"))
total_gpu = int(node.status.allocatable.get("nvidia.com/gpu", "0"))
# Allocated resources
allocated_cpu = node_pod_resources.get(node_name, {}).get("cpu", 0) / 1000.0 # millicores -> cores
allocated_memory = node_pod_resources.get(node_name, {}).get("memory", 0) / (1024 ** 3) # bytes -> Gi
allocated_gpu = node_pod_resources.get(node_name, {}).get("gpu", 0)
# Available resources
available_cpu = total_cpu - allocated_cpu
available_memory = total_memory - allocated_memory
available_gpu = total_gpu - allocated_gpu
node_info = {
# "node_name": node_name,
# "node_status": status,
# "node_role": roles_str,
# "node_age": node_age,
# "node_version": k8s_version,
# "node_internalip": internal_ip,
# "node_externalip": external_ip,
# "node_osversion": os_image,
# "node_kernelversion": kernel_version,
# "node_containeruntime": container_runtime,
# "node_labels": kyy_labels,
# "node_cpurate": cpu_usage,
# "node_memrate": memory_usage,
# 新增资源信息
"node_total_cpu": f"{total_cpu:.2f}",
"node_allocated_cpu": f"{allocated_cpu:.2f}",
"node_available_cpu": f"{available_cpu:.2f}",
"node_cpu_usage_percent": f"{(allocated_cpu / total_cpu * 100):.1f}%" if total_cpu > 0 else "0%",
"node_total_memory": f"{total_memory:.2f}Gi",
"node_allocated_memory": f"{allocated_memory:.2f}Gi",
"node_available_memory": f"{available_memory:.2f}Gi",
"node_memory_usage_percent": f"{(allocated_memory / total_memory * 100):.1f}%" if total_memory > 0 else "0%",
"node_total_gpu": total_gpu,
"node_allocated_gpu": allocated_gpu,
"node_available_gpu": available_gpu,
"node_gpu_usage_percent": f"{(allocated_gpu / total_gpu * 100):.1f}%" if total_gpu > 0 else "0%"
}
rows.append(node_info)
result = {
"total": len(rows),
"rows": rows
}
print(f"=== node_info={result}")
return result
except Exception as e:
import traceback
traceback.print_exc()
print(f"获取节点信息失败: {e}")
raise e
# Helper: parse a resource value
def parse_resource_value(value: str) -> float:
"""Parse a Kubernetes resource value (e.g. "1.5", "500m", "2Gi") into a unified unit:
CPU values are returned in cores, memory values in Gi."""
if not value:
return 0.0
# CPU (cores or millicores)
if value.endswith('m'):
return float(value[:-1]) / 1000.0 # millicores -> cores
elif re.match(r'^\d+(\.\d+)?$', value):
return float(value) # already in cores
# Memory (Ki, Mi, Gi, Ti)
elif value.endswith('Ki'):
return float(value[:-2]) / (1024 ** 2) # Ki -> Gi
elif value.endswith('Mi'):
return float(value[:-2]) / (1024 ** 1) # Mi -> Gi
elif value.endswith('Gi'):
return float(value[:-2])
elif value.endswith('Ti'):
return float(value[:-2]) * 1024 # Ti -> Gi
return float(value) # fall back to the raw value
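# --- Hedged examples of parse_resource_value under the convention used above ---
# CPU values come back in cores, memory values in Gi.
#   parse_resource_value("500m")      -> 0.5   (cores)
#   parse_resource_value("2")         -> 2.0   (cores)
#   parse_resource_value("1048576Ki") -> 1.0   (Gi)
#   parse_resource_value("512Mi")     -> 0.5   (Gi)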
def get_pod_info(kubeconfig):
try:
# config.load_kube_config()
kubeconfig = yaml.safe_load(kubeconfig)
config.load_kube_config_from_dict(kubeconfig)
v1 = client.CoreV1Api()
api_client = client.ApiClient()
namespaces = v1.list_namespace().items
non_system_namespaces = [ns.metadata.name for ns in namespaces if
not ns.metadata.name.startswith(('kube-', 'default', 'local', 'ingress-'))]
rows = []
for namespace in non_system_namespaces:
pods = v1.list_namespaced_pod(namespace).items
pod_metrics_path = f"/apis/metrics.k8s.io/v1beta1/namespaces/{namespace}/pods"
pod_metrics_response = api_client.call_api(
pod_metrics_path, 'GET', auth_settings=['BearerToken'], response_type='object')[0]
pod_metrics = {pod['metadata']['name']: pod.get("containers",[{}])[0].get('usage', {})
for pod in pod_metrics_response.get('items', [])}
# debug(f"### pod_metrics_response={pod_metrics_response}")
for pod in pods:
pod_name = pod.metadata.name
if pod.status.container_statuses:
ready_count = sum(1 for cs in pod.status.container_statuses if cs.ready)
else:
ready_count = 0
ready_status = f"{ready_count}/{len(pod.spec.containers)}"
readiness_conditions = [{"type": cond.type, "status": cond.status}
for cond in pod.status.conditions if cond.type == "Ready"]
phase = pod.status.phase
restart_count = sum(cs.restart_count for cs in pod.status.container_statuses) if pod.status.container_statuses else 0
running_time = time.time() - pod.metadata.creation_timestamp.timestamp()
pod_age = running_time
pod_ip = pod.status.pod_ip if pod.status.pod_ip else "Unknown"
node_name = pod.spec.node_name if pod.spec.node_name else "Pod未被调度到节点"
nominated_node = pod.status.nominated_node_name if pod.status.nominated_node_name else ""
if phase == "Pending":
pod_ip = "Pending状态,未分配 IP"
node_name = "Pending状态,未分配节点"
nominated_node = "Pending状态,未分配节点"
# Extract container resource limits
cpu_limit = "未设置"
memory_limit = "未设置"
gpu_limit = "未设置"
if pod.spec.containers:
container = pod.spec.containers[0] # assume only the first container matters
if container.resources and container.resources.limits:
limits = container.resources.limits
cpu_limit = limits.get("cpu", "未设置")
memory_limit = limits.get("memory", "未设置")
gpu_limit = limits.get("nvidia.com/gpu", "未设置") # 只支持 NVIDIA GPU
# Fetch metrics data (existing logic unchanged)
cpu_usage = pod_metrics.get(pod_name, {}).get('cpu', 'undefined')
if cpu_usage and isinstance(cpu_usage, str):
cpu_usage = int(cpu_usage.replace("n", ""))
cpu_usage = f'{(cpu_usage / 1000000 / 1000):.3f}'
memory_usage = pod_metrics.get(pod_name, {}).get('memory', 'undefined')
if memory_usage and isinstance(memory_usage, str):
memory_usage = int(memory_usage.replace("Ki", ""))
memory_usage = f"{(memory_usage / 1024):.3f}Mi"
if phase in ["Pending", "Succeeded", "Failed"]:
cpu_usage = "Pod未运行,无资源使用数据"
memory_usage = "Pod未运行,无资源使用数据"
# GPU usage field (placeholder for now)
gpu_usage = "0%" # replace with a real value if DCGM / Prometheus metrics are available
pod_info = {
"pod_namespace": namespace,
"pod_name": pod_name,
"pod_ready": ready_status,
"pod_running": phase,
"pod_restart": str(restart_count),
"pod_age": pod_age,
"pod_ip": pod_ip,
"pod_node": node_name,
"pod_nominated_node": nominated_node,
"pod_cpurate": cpu_usage,
"pod_memrate": memory_usage,
# additional fields
"pod_cpu_limit": cpu_limit,
"pod_memory_limit": memory_limit,
"pod_gpu_limit": gpu_limit,
"pod_gpu_usage": gpu_usage,
}
rows.append(pod_info)
result = {
"total": len(rows),
"rows": rows
}
return result
except Exception as e:
raise Exception(f"Failed to get Pod info: {e}")
if __name__ == "__main__":
kubeconfig = """apiVersion: v1
clusters:
- cluster:
certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURCVENDQWUyZ0F3SUJBZ0lJTGd4THlGMjM3QmN3RFFZSktvWklodmNOQVFFTEJRQXdGVEVUTUJFR0ExVUUKQXhNS2EzVmlaWEp1WlhSbGN6QWVGdzB5TlRBME1ETXdOelE1TXpWYUZ3MHpOVEEwTURFd056VTBNelZhTUJVeApFekFSQmdOVkJBTVRDbXQxWW1WeWJtVjBaWE13Z2dFaU1BMEdDU3FHU0liM0RRRUJBUVVBQTRJQkR3QXdnZ0VLCkFvSUJBUURQUm5hdkZmNXBTWWUvVmJLc0s2SnhEazhyc2hsc2h5WnNNRk8xZDVhZG45Z055T0wwR2NtbEsrQ1EKVklKSnF3RklJeSsxUVlEd3VRMytzczEwYmV2Y2lqM1BRanluaXJRRkNhRlA0NHh2ZkEyK2thV1FYeTVncGwrMwpjSkI1K1MxVmx2Vi9aSHQ5SXgwNjFCdHB4dE5oMUkxNS9IYk4rWmVNNnEvd3lxUW93Y01ub2pyNDltYkxxOWNwCnFSem5LL2FwWXlBYnljUk9uWWlIZ0FjQWdsclFOTjBKUEJZd2dRd0pIUmlIcGhtVFBkdmY2ckxkNFR0dFl2OXgKdmZIRDNjVUdwZkVBUElaNUJBVi9ZM3p5V0pSbDQzSFV2Ri9jemNDQ01jOVlUd3VXaEpxb2doUUZUdnNuSVZzTwovNEtKQzRwQXFSenJlZFRWdExmMXgzQlRpVCt0QWdNQkFBR2pXVEJYTUE0R0ExVWREd0VCL3dRRUF3SUNwREFQCkJnTlZIUk1CQWY4RUJUQURBUUgvTUIwR0ExVWREZ1FXQkJUZjRZbzBpOVpIZC9ObzdkYWZrZVRTbzVzdzN6QVYKQmdOVkhSRUVEakFNZ2dwcmRXSmxjbTVsZEdWek1BMEdDU3FHU0liM0RRRUJDd1VBQTRJQkFRRERLalJWVVp1YwppckJ4QWdZWnBBeW5NSHdWQTF6YStVT09FM1U0MEMyVTN0VGgrK1BsN2o2ZGJZTWNWdEFvTXhxaDRiVjNQRW5SCmtWcWNaN2NjS3FGSXduZlhHK0ZGTVVwazVoTk0xLzc2UXBobi9OWk8zSStSVjFtV0VOU1BzM1FZdEVoWktXUlgKYWRXZ0krK0x1cUZyZVpTVzVjRXNnMWZDODFtd3dhTXdkRHZWcFJZMFEwWlBsMGFqTURsSlNDaDNOSXpQOS82bwpndXBrY1JSdWtvRGlscWVraXlrRWJ5OVJCWHZIbXo3Q0sxQ1ZnZXZJTDZrVnRPRFF2Rm10Qm1WemlRNWFDcXJOCmtZNmd6OUNGMkdKc2M4UkZrcWQxbzdMelhPakJsTkdzN2k2WmdEOE1Ca2tiank2RmZDZWVndmxOOGFCU2VmblEKZ2ZNOVptbnRpMVNDCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K
server: https://192.168.0.3:6443
name: kubernetes
contexts:
- context:
cluster: kubernetes
user: kubernetes-admin
name: kubernetes-admin@kubernetes
current-context: kubernetes-admin@kubernetes
kind: Config
preferences: {}
users:
- name: kubernetes-admin
user:
client-certificate-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURJRENDQWdpZ0F3SUJBZ0lIVGZPdmU4TzBJVEFOQmdrcWhraUc5dzBCQVFzRkFEQVZNUk13RVFZRFZRUUQKRXdwcmRXSmxjbTVsZEdWek1CNFhEVEkxTURRd016QTNORGt6TlZvWERUSTJNRFF3TXpBM05UUXpOMW93TkRFWApNQlVHQTFVRUNoTU9jM2x6ZEdWdE9tMWhjM1JsY25NeEdUQVhCZ05WQkFNVEVHdDFZbVZ5Ym1WMFpYTXRZV1J0CmFXNHdnZ0VpTUEwR0NTcUdTSWIzRFFFQkFRVUFBNElCRHdBd2dnRUtBb0lCQVFEWVJJT3h0TWFkOWs2T1JsL1UKZ2ZnZVJDQkpjZmMrc2ZFbzkxeW4vc05KZFVIbWRuamtMaC9wRjcwZkdoVWZ3R2t5dzR0WkdpTFFNR0xwclpyeAphVTdJT0R3a3I2ejl1SkQzaHlFZExhZGpZT0NOMHJhUFNpV05GV1QwSVN2UVBjZzNGQkQ2YmFHb2RtSmN5YnBPCk5qY1VZZmh5WEVqRXMwOU92QzhhZUJCbm9Na1RkRk53dlFaYXE2LzR3eTUyN0k3aUdIUVdvL21JS1VUVHhzRFgKMzJnVXErZmRVMEh5STJJeWhNMGdwT29uNURCVmRUbWsyMkZsVHk0ZWJ3Q3R4QmMvRCtpelhuZFpVd2tHMExMVwpqTEc4L3JkWTZ4WFJDVkhHM1BWNURRK0JvNEpnMTUwWWFSUnBKeDJYSGxad3N5OFBZcWVLcTM0b1pxczRTRndmCjJCY3JBZ01CQUFHalZqQlVNQTRHQTFVZER3RUIvd1FFQXdJRm9EQVRCZ05WSFNVRUREQUtCZ2dyQmdFRkJRY0QKQWpBTUJnTlZIUk1CQWY4RUFqQUFNQjhHQTFVZEl3UVlNQmFBRk4vaGlqU0wxa2QzODJqdDFwK1I1TktqbXpEZgpNQTBHQ1NxR1NJYjNEUUVCQ3dVQUE0SUJBUUFTR0phc1EyQXpLdVNZWFdtMGlYOUhnWTNZQUJGMHpYRzRKZU5lCjREekxkOHF2TXlqRGMwUWFWSUtNbWswemhrV1ZIQzNKSEZWalRXcDBUNFE0TlVBMk8rOXFob1p0a25NL3dsQlUKS0Zab3ZHNFd6SU1sdVJwL21ZRUIzL3dHbkFPV01MdEtBSWJ3d3FRVWl4VW5KYkxCeG4xQ1k5ZERzb1o4VmZZMQp4N2R0WDBJWjJkbU1ETTVLV1lrbW5tQWJBR0tXazZBR3pVWEpWNmlTU3laYjlWLzNuN3hmZlpZRkVDQXBQNk91CjhmRGdIVjBCdEMxS3VmU0tsTitLMnF2aXAzMlRjRHdoTEVHQWQ2aU9qYzhBRXlHelJmOWY4M0xUSGJ2dGtibjYKR0VQQlBQSExSTFlQWEh0OE9LbHdNOThwQWxkSkIyWEJ6UEttc0JFeGFOSWRXd2FTCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K
client-key-data: LS0tLS1CRUdJTiBSU0EgUFJJVkFURSBLRVktLS0tLQpNSUlFcEFJQkFBS0NBUUVBMkVTRHNiVEduZlpPamtaZjFJSDRIa1FnU1hIM1BySHhLUGRjcC83RFNYVkI1blo0CjVDNGY2UmU5SHhvVkg4QnBNc09MV1JvaTBEQmk2YTJhOFdsT3lEZzhKSytzL2JpUTk0Y2hIUzJuWTJEZ2pkSzIKajBvbGpSVms5Q0VyMEQzSU54UVErbTJocUhaaVhNbTZUalkzRkdINGNseEl4TE5QVHJ3dkduZ1FaNkRKRTNSVApjTDBHV3F1ditNTXVkdXlPNGhoMEZxUDVpQ2xFMDhiQTE5OW9GS3ZuM1ZOQjhpTmlNb1ROSUtUcUorUXdWWFU1CnBOdGhaVTh1SG04QXJjUVhQdy9vczE1M1dWTUpCdEN5MW95eHZQNjNXT3NWMFFsUnh0ejFlUTBQZ2FPQ1lOZWQKR0drVWFTY2RseDVXY0xNdkQyS25pcXQrS0dhck9FaGNIOWdYS3dJREFRQUJBb0lCQVFDQ1djRjZ3YmdaQzVWTApvZFV1MCt1RjZvLy9WS2F1YmpncDlmWXQ5NXNqVW42Vzl2OWtvUHh3MVBNVHBQZm9mR09yeWpyYVNLdUZDalVFCkhiUlBINmJ4ZlJ1YkRSdmFqWDByQkpLTDhMRjhiNjdKTEtFR2VxMXBmT1N0VkxVQXZjeElqbHF4WnBUU1loQmwKVnQxcE9MbzRHZGpTclJiYklDeUVDMTdrdUV0QytZV3lFb3E5MmlLNVdMTHdHM2hwVzhyVlVLVzZ2T0cyd0l4bAp0RWhMSGpOOWtnb1VVa2pORG9tK2FlcVVxeXhDeUZEdll4UmdhVTd0Y3pJSk52SUk3aDYxaExQbEZtMmxGQ0xlCjhjeTdKUDMyV1ZDSUpUMHhRNkJJRTdvVld4WWIvMzFVSHYrTHg0UHlBcFpiZ3piMjlvQm54VjhneUxnVjZDWW0Kd1psQlQ4S2hBb0dCQU9tMFZqTkVHVm5EaXNsTDFVVkNKYzFCVU1KcjNwalQvV0g4d2s0UzJYWmhwRWdVQmpQYgpDM3Y5czkxNHh6SjhXYWFtUFZPVGZMRmxzRWFLNnJpMFhjQkhXQi9ob1R1aDVKaDByS1RNWWFMTm9SdU00VCt6Ci9zUG1aY1ZMVXcxdHFmd3U5YlVpSTJCQURQNFM2MUFubk5hSnF1UmFWRk8vT1pqZUkvbHJzMVBSQW9HQkFPem0KVTNvcjNuSDh4WHI2WDNJUjRDM3l3TkZLaHNVVE44VmdWNWRVL0U5RmRHTldUVzRkWHdCK01jeUlQMlFLbjlycwpmcU9Cb0c3NlRKVHF0YzVobjY5Q014c1lVNVdPcDhOZW9oaXplY1luSTFjTk94TmZwdzZDdUZVb1pmTFFxU1dICmJ4dEVEaFkrcXJjR2FLZ3VzMk1uMkJ2cEg1bUhCTk5DL05pSVZ1WTdBb0dBZFlnVEhkOHVuSjBockJCdUpsR1kKN3p2YzRKb2RMV0RYZWpNQ2lQOGp6RXhZc1VNWXgzVnV0aUdtRmtpS2JWSnFSOHdzNVY0MEJJY3VlcHVjWmQyWApsSDZNekNQTjBVNmV4eWxPTmVidlowL2dxUmxWb3BMa0dpTkJwVkkzWjNaeVdYaElhNXJLamJwSWpuSjNVeTFJCnpBQWFLSk5nKzJrZEQwc1FibnlDaURFQ2dZQVFDZVA2OEg5bDdqd2NnRmozNnhmblpIa0RjbTAvYUhhdEtVR2sKNEQ4WXl0WC9aN2RrVGg3QmRNbkFWRFVlZTgyb3o3d2ZLOGFGM1BKVVhyT2lYbCttU1BBVzFJWE1LVlZZVjg3WApwMGNHVUY0SEpjRXJKWjIwME1yVUVTRWQyRnlyU3NrTjZvU2RvdTZCNTdBc09zVXdZR0UwT290R0pLc0I5cFlSCnZ1RkxRd0tCZ1FEZVFuRElPaUQ2SEpmc2loTC8xZ3dnS0hVeVc2WGYrNFhQODd3ZlVXT1N0SEpza29oTkZHdk8KSnpNdUFvc2V2UGFWdElCSXBZbFgycUlaaHlhdyt2VW9BUTZYRnR3WjM1QWo1T1VPbVFQQUJWbkVXZUJZRzdSaQpaZmhEU2NTek5xb3ozWFpjMnA4a2VMWE1XOWJsTDNNOTdOMFRLbExuZ0NrSTdoaXJMVGE2T0E9PQotLS0tLUVORCBSU0EgUFJJVkFURSBLRVktLS0tLQo="""
# 加载配置
# kubeconfig = yaml.safe_load(kubeconfig)
# config.load_kube_config_from_dict(kubeconfig)
# 测试获取节点信息
# try:
# node_info = get_node_info(kubeconfig)
# print(json.dumps(node_info, indent=4, ensure_ascii=False))
# except Exception as e:
# print(f"Error: {e}")
try:
pod_info = get_pod_info(kubeconfig)
print(json.dumps(pod_info, indent=4, ensure_ascii=False))
except Exception as e:
print(f"Error: {e}")

View File

@ -0,0 +1,54 @@
import re
def parse_resource_value(value_str, resource_type, unit):
"""
解析资源值并转换为可读格式假设 value_str 仅为数字已去除单位
:param value_str: 仅包含数字的字符串已去除单位
:param resource_type: 'cpu' 'memory'
:param unit: 原始单位用于决定如何处理该数值 'm', 'n', 'Ki', 'Mi', 'Gi'
:return: 转换后的可读值和目标单位
"""
# 直接转换输入字符串为浮点数(不再提取单位)
try:
value = float(value_str.strip())
except ValueError:
raise ValueError("无法解析输入字符串为数字")
if resource_type == 'cpu':
# CPU利用率的转换根据unit参数判断原始单位
if unit == 'n': # 纳秒
return value / 1e9, '%'
elif unit == 'm': # 毫核
return value / 1000, 'cores'
else:
# 默认认为是核心数(单位是 core 或直接以整数表示)
return value, 'cores'
elif resource_type == 'memory':
# 内存相关的单位转换
units_dict = {'Ki': 1, 'Mi': 1024, 'Gi': 1024 * 1024}
if unit in units_dict:
bytes_val = value * 1024 * units_dict[unit] # Ki/Mi/Gi -> 字节
elif unit == 'B' or unit == '': # 字节或无单位
bytes_val = value
else:
raise ValueError(f"不支持的内存单位: {unit}")
# 将字节转换为MB或GB
if bytes_val < 1024 * 1024 * 1024:
return bytes_val / (1024 * 1024), 'MB'
else:
return bytes_val / (1024 * 1024 * 1024), 'GB'
else:
raise ValueError("未知的资源类型,应为 'cpu''memory'")
if __name__ == "__main__":
numeric_part = re.sub(r'\D', '', '80739445n')
numeric_part2 = re.sub(r'\D', '', '4792336Ki')
print(f'CPU:{parse_resource_value(numeric_part, "cpu", unit="n")}') # CPU利用率
print(f'内存:{parse_resource_value(numeric_part2, "memory", unit="Ki")}') # 内存利用率

View File

@ -0,0 +1,261 @@
def ssh_execute_commands(host, port, username, password, commands, real_time_log=False):
try:
import paramiko
# Create the SSH client
ssh = paramiko.SSHClient()
# Allow connecting to hosts that are not in known_hosts
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# Connect to the server
ssh.connect(hostname=host, port=port, username=username, password=password)
all_results = []
for command in commands:
# Reset the per-command output so results do not accumulate across commands
result = ""
error = ""
# stdin, stdout, stderr = ssh.exec_command(f'sudo -S {command}', get_pty=True)
# stdin.write(password + '\n')
stdin, stdout, stderr = ssh.exec_command(f'{command}', get_pty=True)
stdin.flush()
if real_time_log:
print(f"开始执行命令: {command}")
# 实时读取标准输出
for line in iter(stdout.readline, ""):
print(line, end="")
result += line
# 实时读取标准错误输出
for line in iter(stderr.readline, ""):
print(line, end="")
error += line
else:
result = stdout.read().decode()
error = stderr.read().decode()
all_results.append((result, error))
if real_time_log:
print(f"命令 {command} 执行结束")
# 关闭连接
ssh.close()
return all_results
except Exception as e:
print(f"SSH 连接或执行命令时出错: {e}")
return None
def new_cluster(params):
# main logic for remotely operating the k8s cluster, to be filled in later
"""
Receives the k8s installation parameters passed from the cpcc side and uses remote SSH
calls to machines on the internal network to install the cluster nodes.
Example parameters:
{'cluster_type': '0', 'host': '192.168.0.3', 'port': '22', 'user': 'ysh', 'password': 'Kyy@123456'}
"""
host = params.get("host")
port = int(params.get("port"))
username = params.get("user")
password = params.get("password")
commands = ['kubectl get nodes', 'kubectl get pods --all-namespaces', 'kubectl get services --all-namespaces']
results = ssh_execute_commands(host, port, username, password, commands, real_time_log=True)
if results:
# print("所有命令执行的整体结果:")
for result, error in results:
if result:
print("执行结果:")
print(result)
if error:
print("错误信息:")
print(error)
return results
import json
import argparse
import logging
from kubernetes import client, config
from kubernetes.client.rest import ApiException
import time
def setup_logging():
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s')
def format_runtime(seconds):
if seconds < 60:
return f"{int(seconds)}s"
elif seconds < 3600:
minutes = int(seconds // 60)
return f"{minutes}m"
elif seconds < 86400:
hours = int(seconds // 3600)
return f"{hours}h"
else:
days = int(seconds // 86400)
return f"{days}d"
def get_node_info():
try:
config.load_kube_config()
v1 = client.CoreV1Api()
api_client = client.ApiClient()
node_metrics_path = "/apis/metrics.k8s.io/v1beta1/nodes"
nodes = v1.list_node().items
node_metrics_response = api_client.call_api(
node_metrics_path, 'GET', auth_settings=['BearerToken'], response_type='object')[0]
node_metrics = {node['metadata']['name']: node.get('usage', {})
for node in node_metrics_response.get('items', [])}
rows = []
for node in nodes:
node_name = node.metadata.name
internal_ip = next((address.address for address in node.status.addresses
if address.type == "InternalIP"), "Unknown")
external_ip = next((address.address for address in node.status.addresses
if address.type == "ExternalIP"), "Unknown")
status = node.status.conditions[-1].status if node.status.conditions else "Unknown"
roles = []
role_labels = [
"node-role.kubernetes.io/control-plane",
"node-role.kubernetes.io/master",
"node-role.kubernetes.io/worker"
]
for label in role_labels:
if label in node.metadata.labels:
roles.append(label.split("/")[-1])
roles_str = ",".join(roles) if roles else "None"
running_time = time.time() - node.metadata.creation_timestamp.timestamp()
node_age = format_runtime(running_time)
k8s_version = node.status.node_info.kubelet_version
os_image = node.status.node_info.os_image
kernel_version = node.status.node_info.kernel_version
container_runtime = node.status.node_info.container_runtime_version
labels = node.metadata.labels
cpu_usage = node_metrics.get(node_name, {}).get('cpu', 'undefined')
memory_usage = node_metrics.get(node_name, {}).get('memory', 'undefined')
node_info = {
"node_name": node_name,
"node_status": status,
"node_role": roles_str,
"node_age": node_age,
"node_version": k8s_version,
"node_internalip": internal_ip,
"node_externalip": external_ip,
"node_osversion": os_image,
"node_kernelversion": kernel_version,
"node_containeruntime": container_runtime,
"node_labels": labels,
"node_cpurate": cpu_usage,
"node_memrate": memory_usage
}
rows.append(node_info)
result = {
"total": len(rows),
"rows": rows
}
return result
except ApiException as e:
logging.error(f"获取节点信息时出错: {e}")
return {"total": 0, "rows": []}
def get_pod_info():
try:
config.load_kube_config()
v1 = client.CoreV1Api()
api_client = client.ApiClient()
namespaces = v1.list_namespace().items
non_system_namespaces = [ns.metadata.name for ns in namespaces if
not ns.metadata.name.startswith(('kube-', 'default', 'local'))]
rows = []
for namespace in non_system_namespaces:
pods = v1.list_namespaced_pod(namespace).items
pod_metrics_path = f"/apis/metrics.k8s.io/v1beta1/namespaces/{namespace}/pods"
pod_metrics_response = api_client.call_api(
pod_metrics_path, 'GET', auth_settings=['BearerToken'], response_type='object')[0]
pod_metrics = {pod['metadata']['name']: pod.get('usage', {})
for pod in pod_metrics_response.get('items', [])}
for pod in pods:
pod_name = pod.metadata.name
if pod.status.container_statuses:
ready_count = sum(1 for cs in pod.status.container_statuses if cs.ready)
else:
ready_count = 0
ready_status = f"{ready_count}/{len(pod.spec.containers)}"
readiness_conditions = [{"type": cond.type, "status": cond.status}
for cond in pod.status.conditions if cond.type == "Ready"]
phase = pod.status.phase
restart_count = sum(cs.restart_count for cs in pod.status.container_statuses) if pod.status.container_statuses else 0
running_time = time.time() - pod.metadata.creation_timestamp.timestamp()
pod_age = format_runtime(running_time)
pod_ip = pod.status.pod_ip if pod.status.pod_ip else "Unknown"
node_name = pod.spec.node_name if pod.spec.node_name else "Pod 未被调度到节点"
nominated_node = pod.status.nominated_node_name if pod.status.nominated_node_name else "调度器未提名节点"
if phase == "Pending":
pod_ip = "Pod 处于 Pending 状态,未分配 IP"
node_name = "Pod 处于 Pending 状态,未被调度到节点"
nominated_node = "Pod 处于 Pending 状态,调度器未提名节点"
readiness_gates = []
cpu_usage = pod_metrics.get(pod_name, {}).get('cpu', 'undefined')
memory_usage = pod_metrics.get(pod_name, {}).get('memory', 'undefined')
if phase in ["Pending", "Succeeded", "Failed"]:
cpu_usage = "Pod 未运行,无资源使用数据"
memory_usage = "Pod 未运行,无资源使用数据"
pod_info = {
"pod_namespace": namespace,
"pod_name": pod_name,
"pod_ready": ready_status,
"pod_running": phase,
"pod_restart": restart_count,
"pod_age": pod_age,
"pod_ip": pod_ip,
"pod_node": node_name,
"pod_nominated_node": nominated_node,
"pod_readiness_gates": readiness_gates,
"pod_cpurate": cpu_usage,
"pod_memrate": memory_usage
}
rows.append(pod_info)
result = {
"total": len(rows),
"rows": rows
}
return result
except ApiException as e:
logging.error(f"获取Pod信息时出错: {e}")
return {"total": 0, "rows": []}
if __name__ == "__main__":
# params = {'cluster_type': '0', 'host': '192.168.0.3', 'port': '22', 'user': 'root', 'password': 'Yuanshenhong.1'}
# new_cluster(params)
parser = argparse.ArgumentParser(description='获取Kubernetes节点和Pod实时信息')
parser.add_argument('--interval', type=int, default=300, help='刷新间隔(秒)')
args = parser.parse_args()
setup_logging()
while True:
node_info = get_node_info()
pod_info = get_pod_info()
result = {
"节点信息": node_info,
"Pod信息": pod_info
}
logging.info(json.dumps(result, indent=4, ensure_ascii=False))
time.sleep(args.interval)

View File

@ -0,0 +1,79 @@
def ssh_execute_command(host, port, username, password, commands, real_time_log=False,
remote_exec=True, scp_map=dict()):
try:
import os
import paramiko
# 创建 SSH 对象
ssh = paramiko.SSHClient()
# 允许连接不在 know_hosts 文件中的主机
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# 连接服务器
ssh.connect(hostname=host, port=port, username=username, password=password)
all_results = []
if scp_map:
sftp = ssh.open_sftp()
for sf, df in scp_map.items():
# 1. Upload to the /tmp/ directory
tmp_path = f"/tmp/{os.path.basename(df)}"
print(f"Uploading {sf} to temporary path {tmp_path}")
sftp.put(sf, tmp_path)
# 2. Move to the target directory with sudo (-S reads the password from stdin)
cmd = f"echo {password} | sudo -S mv {tmp_path} {df}"
print(f"Moving {tmp_path} to {df} with sudo")
stdin, stdout, stderr = ssh.exec_command(cmd)
exit_status = stdout.channel.recv_exit_status()
if exit_status != 0:
print(f"Move failed: {stderr.read().decode()}")
else:
print("Move succeeded")
sftp.close()
if remote_exec:
# Common flow: run each command and collect its output
for command in commands:
result = ""
error = ""
stdin, stdout, stderr = ssh.exec_command(f'{command}', get_pty=True)
stdin.flush()
if real_time_log:
print(f"开始执行命令: {command=}, 请耐心等待...")
# 实时读取标准输出
for line in iter(stdout.readline, ""):
print(f'{line=}')
result += line
# 实时读取标准错误输出
for line in iter(stderr.readline, ""):
print(f'{line=}')
error += line
else:
result = stdout.read().decode()
error = stderr.read().decode()
all_results.append((result, error))
if real_time_log:
print(f"命令 {command=} 执行结束")
# 关闭连接
ssh.close()
return all_results
except Exception as e:
print(f"SSH连接或执行命令时出错: {e=}")
return [e]
if __name__ == "__main__":
# 测试代码
host = ""
port = 22
username = ""
password = ""
commands = ["sudo", "apt-get update"]
scp_map = {
"local_file.txt": "/remote/path/remote_file.txt"
}
results = ssh_execute_command(host, port, username, password, commands, real_time_log=True, scp_map=scp_map)
for result, error in results:
print(f"Result: {result}")
print(f"Error: {error}")
# This code is a simplified version of the SSH command execution utility.
# It uses the paramiko library to connect to a remote server and execute commands.
# The code includes functionality for uploading files via SFTP and executing commands with real-time logging.

View File

@ -0,0 +1,70 @@
def ssh_execute_commands(host, port, username, password, commands, real_time_log=False):
try:
import paramiko
# 创建 SSH 对象
ssh = paramiko.SSHClient()
# 允许连接不在 know_hosts 文件中的主机
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# 连接服务器
ssh.connect(hostname=host, port=port, username=username, password=password)
all_results = []
result = ""
error = ""
for command in commands:
# stdin, stdout, stderr = ssh.exec_command(f'sudo -S {command}', get_pty=True)
# stdin.write(password + '\n')
stdin, stdout, stderr = ssh.exec_command(f'{command}', get_pty=True)
stdin.flush()
if real_time_log:
print(f"开始执行命令: {command}")
# 实时读取标准输出
for line in iter(stdout.readline, ""):
print(line, end="")
result += line
# 实时读取标准错误输出
for line in iter(stderr.readline, ""):
print(line, end="")
error += line
else:
result = stdout.read().decode()
error = stderr.read().decode()
all_results.append((result, error))
if real_time_log:
print(f"命令 {command} 执行结束")
# 关闭连接
ssh.close()
return all_results
except Exception as e:
print(f"SSH 连接或执行命令时出错: {e}")
return None
def new_cluster(params):
# main logic for remotely operating the k8s cluster, to be filled in later
"""
Receives the k8s installation parameters passed from the cpcc side and uses remote SSH
calls to machines on the internal network to install the cluster nodes.
Example parameters:
{'cluster_type': '0', 'host': '192.168.0.3', 'port': '22', 'user': 'ysh', 'password': 'Kyy@123456'}
"""
host = params.get("host")
port = int(params.get("port"))
username = params.get("user")
password = params.get("password")
commands = ['cd /install/ && ./k8s_install_1804.sh master','cd /install/ && cat join_command.txt']
results = ssh_execute_commands(host, port, username, password, commands, real_time_log=True)
if results:
print("所有命令执行的整体结果:")
for result, error in results:
if result:
print("执行结果:")
print(result)
if error:
print("错误信息:")
print(error)
return results
if __name__ == "__main__":
params = {'cluster_type': '0', 'host': '192.168.0.2', 'port': '22', 'user': 'root', 'password': 'Yuanshenhong.1'}
new_cluster(params)

55
conf/config.json Normal file
View File

@ -0,0 +1,55 @@
{
"logger":{
"name":"pcapi",
"levelname":"info",
"logfile":"$[workdir]$/logs/pcapi.log"
},
"authentication":{
"user":"root",
"password":"Kyy@123456",
"iplist":[
"47.93.12.75",
"127.0.0.1",
"117.50.205.57",
"10.60.179.61",
"114.246.236.28",
"115.190.98.166",
"61.48.132.253",
"114.246.239.237",
"223.72.41.93",
"111.201.209.76"
]
},
"filesroot":"$[workdir]$/files",
"website":{
"paths":[
["$[workdir]$/wwwroot",""]
],
"client_max_size":10000,
"host":"0.0.0.0",
"port":9001,
"coding":"utf-8",
"indexes":[
"index.dspy",
"index.md"
],
"startswiths":[
{
"leading":"/idfile",
"registerfunction":"idFileDownload"
}
],
"processors":[
[".dspy","dspy"],
[".md","md"]
],
"session_max_time":3000,
"session_issue_time":2500
},
"langMapping":{
"zh-Hans-CN":"zh-cn",
"zh-CN":"zh-cn",
"en-us":"en",
"en-US":"en"
}
}

0
files/README.md Normal file
View File

202
files/components.yaml Normal file
View File

@ -0,0 +1,202 @@
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
k8s-app: metrics-server
name: metrics-server
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
k8s-app: metrics-server
rbac.authorization.k8s.io/aggregate-to-admin: "true"
rbac.authorization.k8s.io/aggregate-to-edit: "true"
rbac.authorization.k8s.io/aggregate-to-view: "true"
name: system:aggregated-metrics-reader
rules:
- apiGroups:
- metrics.k8s.io
resources:
- pods
- nodes
verbs:
- get
- list
- watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
k8s-app: metrics-server
name: system:metrics-server
rules:
- apiGroups:
- ""
resources:
- nodes/metrics
verbs:
- get
- apiGroups:
- ""
resources:
- pods
- nodes
verbs:
- get
- list
- watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
k8s-app: metrics-server
name: metrics-server-auth-reader
namespace: kube-system
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: extension-apiserver-authentication-reader
subjects:
- kind: ServiceAccount
name: metrics-server
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
k8s-app: metrics-server
name: metrics-server:system:auth-delegator
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:auth-delegator
subjects:
- kind: ServiceAccount
name: metrics-server
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
k8s-app: metrics-server
name: system:metrics-server
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:metrics-server
subjects:
- kind: ServiceAccount
name: metrics-server
namespace: kube-system
---
apiVersion: v1
kind: Service
metadata:
labels:
k8s-app: metrics-server
name: metrics-server
namespace: kube-system
spec:
ports:
- name: https
port: 443
protocol: TCP
targetPort: https
selector:
k8s-app: metrics-server
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
k8s-app: metrics-server
name: metrics-server
namespace: kube-system
spec:
selector:
matchLabels:
k8s-app: metrics-server
strategy:
rollingUpdate:
maxUnavailable: 0
template:
metadata:
labels:
k8s-app: metrics-server
spec:
containers:
- args:
- --cert-dir=/tmp
- --secure-port=10250
- --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname
- --kubelet-use-node-status-port
- --metric-resolution=15s
- --kubelet-insecure-tls
image: registry.aliyuncs.com/google_containers/metrics-server:v0.7.2
imagePullPolicy: IfNotPresent
livenessProbe:
failureThreshold: 3
httpGet:
path: /livez
port: https
scheme: HTTPS
periodSeconds: 10
name: metrics-server
ports:
- containerPort: 10250
name: https
protocol: TCP
readinessProbe:
failureThreshold: 3
httpGet:
path: /readyz
port: https
scheme: HTTPS
initialDelaySeconds: 20
periodSeconds: 10
resources:
requests:
cpu: 100m
memory: 200Mi
securityContext:
allowPrivilegeEscalation: false
capabilities:
drop:
- ALL
readOnlyRootFilesystem: true
runAsNonRoot: true
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
volumeMounts:
- mountPath: /tmp
name: tmp-dir
nodeSelector:
kubernetes.io/os: linux
priorityClassName: system-cluster-critical
serviceAccountName: metrics-server
volumes:
- emptyDir: {}
name: tmp-dir
---
apiVersion: apiregistration.k8s.io/v1
kind: APIService
metadata:
labels:
k8s-app: metrics-server
name: v1beta1.metrics.k8s.io
spec:
group: metrics.k8s.io
groupPriorityMinimum: 100
insecureSkipTLSVerify: true
service:
name: metrics-server
namespace: kube-system
version: v1beta1
versionPriority: 100

View File

@ -0,0 +1,610 @@
apiVersion: v1
kind: Namespace
metadata:
labels:
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
name: ingress-nginx
---
apiVersion: v1
automountServiceAccountToken: true
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.5.1
name: ingress-nginx
namespace: ingress-nginx
---
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.5.1
name: ingress-nginx-admission
namespace: ingress-nginx
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.5.1
name: ingress-nginx
namespace: ingress-nginx
rules:
- apiGroups:
- ""
resources:
- namespaces
verbs:
- get
- apiGroups:
- ""
resources:
- configmaps
- pods
- secrets
- endpoints
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- services
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses/status
verbs:
- update
- apiGroups:
- networking.k8s.io
resources:
- ingressclasses
verbs:
- get
- list
- watch
- apiGroups:
- ""
resourceNames:
- ingress-nginx-leader
resources:
- configmaps
verbs:
- get
- update
- apiGroups:
- ""
resources:
- configmaps
verbs:
- create
- apiGroups:
- coordination.k8s.io
resourceNames:
- ingress-nginx-leader
resources:
- leases
verbs:
- get
- update
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- create
- apiGroups:
- ""
resources:
- events
verbs:
- create
- patch
- apiGroups:
- discovery.k8s.io
resources:
- endpointslices
verbs:
- list
- watch
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.5.1
name: ingress-nginx-admission
namespace: ingress-nginx
rules:
- apiGroups:
- ""
resources:
- secrets
verbs:
- get
- create
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.5.1
name: ingress-nginx
rules:
- apiGroups:
- ""
resources:
- configmaps
- endpoints
- nodes
- pods
- secrets
- namespaces
verbs:
- list
- watch
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs:
- list
- watch
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- apiGroups:
- ""
resources:
- services
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- events
verbs:
- create
- patch
- apiGroups:
- networking.k8s.io
resources:
- ingresses/status
verbs:
- update
- apiGroups:
- networking.k8s.io
resources:
- ingressclasses
verbs:
- get
- list
- watch
- apiGroups:
- discovery.k8s.io
resources:
- endpointslices
verbs:
- list
- watch
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.5.1
name: ingress-nginx-admission
rules:
- apiGroups:
- admissionregistration.k8s.io
resources:
- validatingwebhookconfigurations
verbs:
- get
- update
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.5.1
name: ingress-nginx
namespace: ingress-nginx
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: ingress-nginx
subjects:
- kind: ServiceAccount
name: ingress-nginx
namespace: ingress-nginx
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.5.1
name: ingress-nginx-admission
namespace: ingress-nginx
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: ingress-nginx-admission
subjects:
- kind: ServiceAccount
name: ingress-nginx-admission
namespace: ingress-nginx
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.5.1
name: ingress-nginx
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: ingress-nginx
subjects:
- kind: ServiceAccount
name: ingress-nginx
namespace: ingress-nginx
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.5.1
name: ingress-nginx-admission
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: ingress-nginx-admission
subjects:
- kind: ServiceAccount
name: ingress-nginx-admission
namespace: ingress-nginx
---
apiVersion: v1
data:
allow-snippet-annotations: "true"
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.5.1
name: ingress-nginx-controller
namespace: ingress-nginx
---
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.5.1
name: ingress-nginx-controller
namespace: ingress-nginx
spec:
externalTrafficPolicy: Local
ipFamilies:
- IPv4
ipFamilyPolicy: SingleStack
ports:
- appProtocol: http
name: http
port: 80
protocol: TCP
targetPort: http
- appProtocol: https
name: https
port: 443
protocol: TCP
targetPort: https
selector:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
type: LoadBalancer
---
apiVersion: v1
kind: Service
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.5.1
name: ingress-nginx-controller-admission
namespace: ingress-nginx
spec:
ports:
- appProtocol: https
name: https-webhook
port: 443
targetPort: webhook
selector:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.5.1
name: ingress-nginx-controller
namespace: ingress-nginx
spec:
minReadySeconds: 0
revisionHistoryLimit: 10
selector:
matchLabels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
template:
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
spec:
containers:
- args:
- /nginx-ingress-controller
- --publish-service=$(POD_NAMESPACE)/ingress-nginx-controller
- --election-id=ingress-nginx-leader
- --controller-class=k8s.io/ingress-nginx
- --ingress-class=nginx
- --configmap=$(POD_NAMESPACE)/ingress-nginx-controller
- --validating-webhook=:8443
- --validating-webhook-certificate=/usr/local/certificates/cert
- --validating-webhook-key=/usr/local/certificates/key
env:
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
image: registry.cn-hangzhou.aliyuncs.com/google_containers/nginx-ingress-controller:v1.5.1
imagePullPolicy: IfNotPresent
lifecycle:
preStop:
exec:
command:
- /wait-shutdown
livenessProbe:
failureThreshold: 5
httpGet:
path: /healthz
port: 10254
scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
name: controller
ports:
- containerPort: 80
name: http
protocol: TCP
- containerPort: 443
name: https
protocol: TCP
- containerPort: 8443
name: webhook
protocol: TCP
readinessProbe:
failureThreshold: 3
httpGet:
path: /healthz
port: 10254
scheme: HTTP
initialDelaySeconds: 10
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 1
resources:
requests:
cpu: 100m
memory: 90Mi
securityContext:
allowPrivilegeEscalation: true
capabilities:
add:
- NET_BIND_SERVICE
drop:
- ALL
runAsUser: 101
volumeMounts:
- mountPath: /usr/local/certificates/
name: webhook-cert
readOnly: true
dnsPolicy: ClusterFirst
nodeSelector:
kubernetes.io/os: linux
serviceAccountName: ingress-nginx
terminationGracePeriodSeconds: 300
volumes:
- name: webhook-cert
secret:
secretName: ingress-nginx-admission
---
apiVersion: batch/v1
kind: Job
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.5.1
name: ingress-nginx-admission-create
namespace: ingress-nginx
spec:
template:
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.5.1
name: ingress-nginx-admission-create
spec:
containers:
- args:
- create
- --host=ingress-nginx-controller-admission,ingress-nginx-controller-admission.$(POD_NAMESPACE).svc
- --namespace=$(POD_NAMESPACE)
- --secret-name=ingress-nginx-admission
env:
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
image: registry.cn-hangzhou.aliyuncs.com/google_containers/kube-webhook-certgen:v1.1.1
imagePullPolicy: IfNotPresent
name: create
securityContext:
allowPrivilegeEscalation: false
nodeSelector:
kubernetes.io/os: linux
restartPolicy: OnFailure
securityContext:
fsGroup: 2000
runAsNonRoot: true
runAsUser: 2000
serviceAccountName: ingress-nginx-admission
---
apiVersion: batch/v1
kind: Job
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.5.1
name: ingress-nginx-admission-patch
namespace: ingress-nginx
spec:
template:
metadata:
labels:
app.kubernetes.io/component: admission-webhook
app.kubernetes.io/instance: ingress-nginx
app.kubernetes.io/name: ingress-nginx
app.kubernetes.io/part-of: ingress-nginx
app.kubernetes.io/version: 1.5.1
name: ingress-nginx-admission-patch
spec:
containers:
- args:
- patch
- --webhook-name=ingress-nginx-admission
- --namespace=$(POD_NAMESPACE)
- --patch-mutating=false
- --secret-name=ingress-nginx-admission
- --patch-failure-policy=Fail
env:
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
image: registry.cn-hangzhou.aliyuncs.com/google_containers/kube-webhook-certgen:v1.1.1
imagePullPolicy: IfNotPresent
name: patch
securityContext:
allowPrivilegeEscalation: false
nodeSelector:
kubernetes.io/os: linux
restartPolicy: OnFailure
securityContext:
fsGroup: 2000
runAsNonRoot: true
runAsUser: 2000
serviceAccountName: ingress-nginx-admission

209
files/kube-flannel.yml Normal file
View File

@ -0,0 +1,209 @@
---
kind: Namespace
apiVersion: v1
metadata:
name: kube-flannel
labels:
k8s-app: flannel
pod-security.kubernetes.io/enforce: privileged
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
labels:
k8s-app: flannel
name: flannel
rules:
- apiGroups:
- ""
resources:
- pods
verbs:
- get
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- nodes/status
verbs:
- patch
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
labels:
k8s-app: flannel
name: flannel
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: flannel
subjects:
- kind: ServiceAccount
name: flannel
namespace: kube-flannel
---
apiVersion: v1
kind: ServiceAccount
metadata:
labels:
k8s-app: flannel
name: flannel
namespace: kube-flannel
---
kind: ConfigMap
apiVersion: v1
metadata:
name: kube-flannel-cfg
namespace: kube-flannel
labels:
tier: node
k8s-app: flannel
app: flannel
data:
cni-conf.json: |
{
"name": "cbr0",
"cniVersion": "0.3.1",
"plugins": [
{
"type": "flannel",
"delegate": {
"hairpinMode": true,
"isDefaultGateway": true
}
},
{
"type": "portmap",
"capabilities": {
"portMappings": true
}
}
]
}
net-conf.json: |
{
"Network": "10.244.0.0/16",
"EnableNFTables": false,
"Backend": {
"Type": "vxlan"
}
}
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: kube-flannel-ds
namespace: kube-flannel
labels:
tier: node
app: flannel
k8s-app: flannel
spec:
selector:
matchLabels:
app: flannel
template:
metadata:
labels:
tier: node
app: flannel
spec:
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kubernetes.io/os
operator: In
values:
- linux
hostNetwork: true
priorityClassName: system-node-critical
tolerations:
- operator: Exists
effect: NoSchedule
serviceAccountName: flannel
initContainers:
- name: install-cni-plugin
image: ghcr.io/flannel-io/flannel-cni-plugin:v1.6.2-flannel1
command:
- cp
args:
- -f
- /flannel
- /opt/cni/bin/flannel
volumeMounts:
- name: cni-plugin
mountPath: /opt/cni/bin
- name: install-cni
image: ghcr.io/flannel-io/flannel:v0.26.4
command:
- cp
args:
- -f
- /etc/kube-flannel/cni-conf.json
- /etc/cni/net.d/10-flannel.conflist
volumeMounts:
- name: cni
mountPath: /etc/cni/net.d
- name: flannel-cfg
mountPath: /etc/kube-flannel/
containers:
- name: kube-flannel
image: ghcr.io/flannel-io/flannel:v0.26.4
command:
- /opt/bin/flanneld
args:
- --ip-masq
- --kube-subnet-mgr
resources:
requests:
cpu: "100m"
memory: "50Mi"
securityContext:
privileged: false
capabilities:
add: ["NET_ADMIN", "NET_RAW"]
env:
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: EVENT_QUEUE_DEPTH
value: "5000"
volumeMounts:
- name: run
mountPath: /run/flannel
- name: flannel-cfg
mountPath: /etc/kube-flannel/
- name: xtables-lock
mountPath: /run/xtables.lock
volumes:
- name: run
hostPath:
path: /run/flannel
- name: cni-plugin
hostPath:
path: /opt/cni/bin
- name: cni
hostPath:
path: /etc/cni/net.d
- name: flannel-cfg
configMap:
name: kube-flannel-cfg
- name: xtables-lock
hostPath:
path: /run/xtables.lock
type: FileOrCreate

View File

@ -0,0 +1,39 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: nfs-client-provisioner
labels:
app: nfs-client-provisioner
spec:
replicas: 1
strategy:
type: Recreate ## upgrade strategy: delete then recreate (the default is rolling update)
selector:
matchLabels:
app: nfs-client-provisioner
template:
metadata:
labels:
app: nfs-client-provisioner
spec:
serviceAccountName: nfs-client-provisioner
containers:
- name: nfs-client-provisioner
#image: gcr.io/k8s-staging-sig-storage/nfs-subdir-external-provisioner:v4.0.0
image: registry.cn-beijing.aliyuncs.com/xngczl/nfs-subdir-external-provisione:v4.0.0
volumeMounts:
- name: nfs-client-root
mountPath: /persistentvolumes
env:
- name: PROVISIONER_NAME ## provisioner name; the StorageClass created later must use the same value
value: k8s-sigs.io/nfs-subdir-external-provisioner
- name: NFS_SERVER ## NFS server address; must match the one configured under volumes
value: 192.168.0.3
- name: NFS_PATH ## NFS data directory on the server; must match the one configured under volumes
value: /d/k8s_nss
volumes:
- name: nfs-client-root
nfs:
server: 192.168.0.3 ## NFS server address
path: /d/k8s_nss ## NFS data directory on the server
readOnly: false

60
files/nfs-rbac.yaml Normal file
View File

@ -0,0 +1,60 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: nfs-client-provisioner
namespace: default # replace with the Namespace you deploy into
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: nfs-client-provisioner-runner
rules:
- apiGroups: [""]
resources: ["persistentvolumes"]
verbs: ["get", "list", "watch", "create", "delete"]
- apiGroups: [""]
resources: ["persistentvolumeclaims"]
verbs: ["get", "list", "watch", "update"]
- apiGroups: ["storage.k8s.io"]
resources: ["storageclasses"]
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources: ["events"]
verbs: ["create", "update", "patch"]
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: run-nfs-client-provisioner
subjects:
- kind: ServiceAccount
name: nfs-client-provisioner
namespace: default
roleRef:
kind: ClusterRole
name: nfs-client-provisioner-runner
apiGroup: rbac.authorization.k8s.io
---
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: leader-locking-nfs-client-provisioner
namespace: default
rules:
- apiGroups: [""]
resources: ["endpoints"]
verbs: ["get", "list", "watch", "create", "update", "patch"]
---
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
name: leader-locking-nfs-client-provisioner
namespace: default
subjects:
- kind: ServiceAccount
name: nfs-client-provisioner
namespace: default
roleRef:
kind: Role
name: leader-locking-nfs-client-provisioner
apiGroup: rbac.authorization.k8s.io

13
files/storage_class.yaml Normal file
View File

@ -0,0 +1,13 @@
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: nfs-storage-class
annotations:
storageclass.kubernetes.io/is-default-class: "false"
allowVolumeExpansion: true
provisioner: k8s-sigs.io/nfs-subdir-external-provisioner
reclaimPolicy: Delete
volumeBindingMode: Immediate
parameters:
pathPattern: "${.PVC.namespace}/${.PVC.name}"
onDelete: delete

5
install.sh Normal file
View File

@ -0,0 +1,5 @@
#!/bin/bash
pip3 -V
pip3 list
pip3 install kubernetes packaging ldap3 paramiko python-dateutil aiohttp-socks asyncssh nanoid redis -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install --upgrade cryptography pyOpenSSL

0
logs/README.md Normal file
View File

3
requirements.txt Normal file
View File

@ -0,0 +1,3 @@
git+https://git.kaiyuancloud.cn/yumoqing/apppublic
git+https://git.kaiyuancloud.cn/yumoqing/sqlor
git+https://git.kaiyuancloud.cn/yumoqing/ahserver

0
script/README.md Normal file
View File

23
script/delete_allimage.sh Normal file
View File

@ -0,0 +1,23 @@
#!/bin/bash
# 停止并删除所有容器和 Pod
echo "Stopping all containers..."
crictl stop $(crictl ps -q) || true
echo "Removing all containers..."
crictl rm $(crictl ps -a -q) || true
echo "Stopping all pods..."
crictl stopp $(crictl pods -q) || true
echo "Removing all pods..."
crictl rmp $(crictl pods -q) || true
# 删除所有镜像crictl 方式)
echo "Deleting all images via crictl..."
crictl images --quiet | xargs -r crictl rmi || true
# 删除所有镜像ctr 方式)
echo "Deleting all images via ctr in k8s.io namespace..."
ctr -n=k8s.io images list --quiet | xargs -r ctr -n=k8s.io image rm || true
echo "Deleting all images via ctr in default namespace..."
ctr -n=default images list --quiet | xargs -r ctr -n=default image rm || true
echo "All images and containers have been deleted."

53
script/export_images.sh Normal file
View File

@ -0,0 +1,53 @@
#!/bin/bash
# 设置 Kubernetes 版本和镜像仓库地址
K8S_VERSION="v1.28.2"
ALIYUN_REGISTRY="registry.aliyuncs.com/google_containers" # 阿里云 Kubernetes 镜像源
FLANNEL_REPO="ghcr.io/flannel-io" # Flannel 镜像仓库
NETWORK_PLUGIN="flannel"
NETWORK_PLUGIN_VERSION="v0.26.4"
NETWORK_PLUGIN_CNI="flannel-cni-plugin"
NETWORK_PLUGIN_CNI_VERSION="v1.6.2-flannel1"
# Kubernetes 控制平面镜像列表(阿里云镜像源)
KUBERNETES_IMAGES=(
"${ALIYUN_REGISTRY}/kube-apiserver:${K8S_VERSION}"
"${ALIYUN_REGISTRY}/kube-controller-manager:${K8S_VERSION}"
"${ALIYUN_REGISTRY}/kube-scheduler:${K8S_VERSION}"
"${ALIYUN_REGISTRY}/kube-proxy:${K8S_VERSION}"
"${ALIYUN_REGISTRY}/pause:3.9"
"${ALIYUN_REGISTRY}/etcd:3.5.9-0"
"${ALIYUN_REGISTRY}/coredns:v1.10.1"
)
# 网络插件镜像Flannel
NETWORK_IMAGES=(
"${FLANNEL_REPO}/${NETWORK_PLUGIN}:${NETWORK_PLUGIN_VERSION}"
)
NETWORK_CNI_IMAGES=(
"${FLANNEL_REPO}/${NETWORK_PLUGIN_CNI}:${NETWORK_PLUGIN_CNI_VERSION}"
)
# 合并所有镜像
ALL_IMAGES=("${KUBERNETES_IMAGES[@]}" "${NETWORK_IMAGES[@]}" "${NETWORK_CNI_IMAGES[@]}")
# 导出本地已存在的镜像(在源节点运行)
function export_images() {
echo "==> 正在导出本地已存在的 Kubernetes v${K8S_VERSION} 镜像..."
mkdir -p /opt/k8s-images
cd /opt/k8s-images || exit
for image in "${ALL_IMAGES[@]}"; do
echo "正在检查并导出镜像:${image}"
if ctr -n=k8s.io images list --quiet | grep -q "${image}"; then
output_file="${image//\//_}.tar"
ctr -n=k8s.io images export ${output_file} ${image} --platform=linux/amd64
echo "✅ 成功导出:${output_file}"
else
echo "⚠️ 镜像 ${image} 不存在于本地,跳过!"
fi
done
}
# 根据需要选择执行导出或导入
export_images # 在源节点运行,导出镜像

116
script/generate_apitoken.sh Executable file
View File

@ -0,0 +1,116 @@
#!/bin/bash
# 定义变量
NAMESPACE="my-namespace"
SERVICE_ACCOUNT="my-sa"
# YAML 内容(确保 Deployment 明确使用 ServiceAccount
all_resources_yaml='
apiVersion: v1
kind: Namespace
metadata:
name: '"$NAMESPACE"'
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: '"$SERVICE_ACCOUNT"'
namespace: '"$NAMESPACE"'
---
apiVersion: v1
kind: Service
metadata:
name: my-mysql-service
namespace: '"$NAMESPACE"'
spec:
type: NodePort
selector:
app: mysql
ports:
- protocol: TCP
port: 3306
targetPort: 3306
nodePort: 30060
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: mysql-deployment
namespace: '"$NAMESPACE"'
spec:
replicas: 1
selector:
matchLabels:
app: mysql
template:
metadata:
labels:
app: mysql
spec:
serviceAccountName: '"$SERVICE_ACCOUNT"' # 关键:强制 Pod 使用该 ServiceAccount
containers:
- name: mysql
image: mysql:8.0
env:
- name: MYSQL_ROOT_PASSWORD
value: "123456"
resources:
limits:
cpu: "300m"
memory: "512Mi"
'
# 创建资源函数
create_resources() {
echo "$all_resources_yaml" | kubectl apply -f -
if [ $? -ne 0 ]; then
echo "资源创建失败"
exit 1
fi
# 新增:等待 Secret 生成(最多 10 秒)
echo "等待 ServiceAccount 的 Secret 生成..."
for i in {1..10}; do
local secret_name=$(kubectl get serviceaccount "$SERVICE_ACCOUNT" -n "$NAMESPACE" -o jsonpath='{.secrets[0].name}' 2>/dev/null)
if [ -n "$secret_name" ]; then
break
fi
sleep 1
done
}
# 删除资源函数
delete_resources() {
echo "$all_resources_yaml" | kubectl delete -f -
if [ $? -ne 0 ]; then
echo "资源创建失败"
exit 1
fi
}
# 获取 Token 函数(优化错误提示)
get_service_account_token() {
local secret_name=$(kubectl get serviceaccount "$SERVICE_ACCOUNT" -n "$NAMESPACE" -o jsonpath='{.secrets[0].name}' 2>/dev/null)
if [ -z "$secret_name" ]; then
echo "错误ServiceAccount 的 Secret 未生成,请检查 Pod 是否正常运行"
exit 1
fi
local token=$(kubectl get secret -n "$NAMESPACE" "$secret_name" -o jsonpath='{.data.token}' | base64 -d)
echo "ApiToken: $token"
}
# 执行流程
create_resources
#echo "资源创建完成"
#kubectl get all -n "$NAMESPACE"
#echo "正在获取 ServiceAccount 的 Token..."
#get_service_account_token
#delete_resources

74
script/import_images.sh Normal file
View File

@ -0,0 +1,74 @@
#!/bin/bash
# 设置 Kubernetes 版本和镜像仓库地址
K8S_VERSION="v1.28.2"
ALIYUN_REGISTRY="registry.aliyuncs.com/google_containers" # 阿里云 Kubernetes 镜像源
HANGZHOU_ALIYUN_REGISTRY="registry.cn-hangzhou.aliyuncs.com/google_containers" # 杭州阿里云镜像站
FLANNEL_REPO="ghcr.io/flannel-io" # Flannel 镜像仓库
NETWORK_PLUGIN="flannel"
NETWORK_PLUGIN_VERSION="v0.26.4"
NETWORK_PLUGIN_CNI="flannel-cni-plugin"
NETWORK_PLUGIN_CNI_VERSION="v1.6.2-flannel1"
# Kubernetes 控制平面镜像列表(阿里云镜像源)
KUBERNETES_IMAGES=(
"${ALIYUN_REGISTRY}/kube-apiserver:${K8S_VERSION}"
"${ALIYUN_REGISTRY}/kube-controller-manager:${K8S_VERSION}"
"${ALIYUN_REGISTRY}/kube-scheduler:${K8S_VERSION}"
"${ALIYUN_REGISTRY}/kube-proxy:${K8S_VERSION}"
"${ALIYUN_REGISTRY}/pause:3.9"
"${ALIYUN_REGISTRY}/etcd:3.5.9-0"
"${ALIYUN_REGISTRY}/coredns:v1.10.1"
"${ALIYUN_REGISTRY}/metrics-server:v0.7.2"
"${HANGZHOU_ALIYUN_REGISTRY}/kube-webhook-certgen:v1.1.1"
"${HANGZHOU_ALIYUN_REGISTRY}/nginx-ingress-controller:v1.5.1"
)
# 网络插件镜像Flannel
NETWORK_IMAGES=(
"${FLANNEL_REPO}/${NETWORK_PLUGIN}:${NETWORK_PLUGIN_VERSION}"
)
NETWORK_CNI_IMAGES=(
"${FLANNEL_REPO}/${NETWORK_PLUGIN_CNI}:${NETWORK_PLUGIN_CNI_VERSION}"
)
# 合并所有镜像
ALL_IMAGES=("${KUBERNETES_IMAGES[@]}" "${NETWORK_IMAGES[@]}" "${NETWORK_CNI_IMAGES[@]}")
# 导入镜像并自动修复配置(在目标节点运行)
function import_images() {
echo "==> 正在导入镜像到目标节点..."
# 2. 停止 containerd 服务
sudo systemctl stop containerd
# 3. 进入镜像目录
cd /opt/k8s-images || exit
# 4. 清理旧镜像(根据你的镜像仓库地址过滤)
echo "正在清理旧镜像..."
for img in $(ctr -n=k8s.io images list --quiet); do
if [[ $img == ${ALIYUN_REGISTRY}* || $img == ${FLANNEL_REPO}* || $img == ${HANGZHOU_ALIYUN_REGISTRY}* ]]; then
ctr -n=k8s.io images rm $img || true
fi
done
sudo systemctl start containerd
# 5. 导入所有 tar 文件
for file in *.tar; do
echo "正在导入镜像:${file}"
ctr -n=k8s.io images import ${file} --platform=linux/amd64
echo "✅ 导入成功:${file}"
done
# 6. 启动 containerd 并验证
#sudo systemctl restart containerd
echo "已导入的镜像列表:"
ctr -n=k8s.io images list | grep -E "${ALIYUN_REGISTRY}|${HANGZHOU_ALIYUN_REGISTRY}|${FLANNEL_REPO}"
crictl images
}
# 根据需要选择执行导入
import_images # 在目标节点运行,导入镜像

660
script/k8s_install.sh Normal file
View File

@ -0,0 +1,660 @@
#!/bin/bash
# 部分ubuntu操作系统在安装包时会出现交互式图形界面弹窗的形式,此处我们忽略
# 交互式提示的本质DEBIAN_FRONTEND=noninteractive 是控制apt/dpkg非交互的核心,其他UCF变量和配置文件是补充,确保配置文件冲突时自动选择新 / 旧版本,避免弹窗。
# packagekit 的作用:该服务主要用于图形化包管理,在服务器环境中可停止但无需mask,mask会导致系统无法正常管理该服务及其依赖。
# 禁用包管理交互式提示(不影响系统服务)
export DEBIAN_FRONTEND=noninteractive
export UCF_FORCE_CONFFNEW=1
export UCF_FORCE_CONFFMISS=1
export UCF_FORCE_CONFFIGNORE=1
# 配置apt和dpkg的非交互行为
echo 'Dpkg::Options {
"--force-confdef";
"--force-confnew";
}' > /etc/apt/apt.conf.d/99noninteractive
echo 'force-confold' > /etc/dpkg/dpkg.cfg.d/force-confold
# 优化仅停止packagekit不mask
systemctl stop packagekit
echo "########## 安装K8S必须root用户下执行 ###########"
# 检查是否为root用户
if [ "$(id -u)" != "0" ]; then
echo "请以root用户身份运行此脚本"
exit 1
fi
# 新设备环境先换阿里源(此处写入的是ubuntu20.04/focal源):
cp /etc/apt/sources.list /etc/apt/sources.list.bak
tee /etc/apt/sources.list << EOF
deb http://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse
deb http://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse
EOF
apt-get update -y
apt upgrade -y
apt install -y libtss2-esys0 -f
# 设置脚本在出错时立即退出,并将错误信息输出
set -e
# set -o pipefail
# 函数:输出日志信息
log_info() {
echo "[INFO] $1"
}
# 函数:输出错误信息并退出
log_error() {
echo "[ERROR] $1" >&2
exit 1
}
# 关闭防火墙
# log_info "关闭防火墙..."
# ufw disable || log_error "关闭防火墙失败"
# selinux相关操作
log_info "安装selinux-utils..."
apt install -y selinux-utils || log_error "安装selinux-utils失败"
log_info "设置SELinux为Permissive模式..."
if grep -q "SELINUX=enforcing" /etc/selinux/config || grep -q "SELINUX=permissive" /etc/selinux/config; then
echo "SELinux已开启"
setenforce 0 || log_error "设置SELinux模式失败"
sed -i 's/^SELINUX=enforcing$/SELINUX=permissive/' /etc/selinux/config || log_error "修改SELinux配置文件失败"
else
echo "SELinux未开启"
fi
#安装htop,vim,net-tools
apt install vim htop net-tools -y || log_error "安装htop,vim,net-tools失败"
# 禁止swap分区
log_info "禁止swap分区..."
swapoff -a || log_error "禁止swap分区失败"
# 注释掉swap一行
sed -i '/swap/s/^/#/' /etc/fstab || log_error "注释swap行失败"
# 桥接的IPV4流量传递到iptables 的链
log_info "配置桥接的IPV4流量传递到iptables的链..."
cat > /etc/sysctl.d/k8s.conf <<EOF
net.bridge.bridge-nf-call-ip6tables = 1
net.bridge.bridge-nf-call-iptables = 1
EOF
sysctl --system || log_error "使sysctl配置生效失败"
# 新增k8s镜像源
log_info "新增k8s镜像源..."
curl -s https://mirrors.aliyun.com/kubernetes/apt/doc/apt-key.gpg | apt-key add - || log_error "添加k8s镜像源的密钥失败"
echo "deb https://mirrors.aliyun.com/kubernetes/apt/ kubernetes-xenial main" > /etc/apt/sources.list.d/kubernetes.list
apt-get update -y || log_error "更新apt源失败"
# 安装nfs
# log_info "安装nfs-common..."
# apt-get install -y nfs-common || log_error "安装nfs-common失败"
apt install -y aptitude
# 更新系统并安装必要工具
log_info "更新系统并安装必要工具..."
apt update -y || log_error "系统更新或升级失败"
apt install -y curl apt-transport-https ipvsadm gnupg2 software-properties-common || log_error "安装必要工具失败"
# 安装docker
log_info "正在跳过安装docker..."
# 删除原有的Docker软件源
# if [ -f /etc/apt/sources.list.d/docker.list ]; then
# rm /etc/apt/sources.list.d/docker.list
# fi
# 添加阿里云的Docker镜像源
# 备份现有文件
# if [ -f /usr/share/keyrings/docker-archive-keyring.gpg ]; then
# mv /usr/share/keyrings/docker-archive-keyring.gpg /usr/share/keyrings/docker-archive-keyring.gpg.bak
# fi
# 覆盖现有文件
curl -fsSL https://mirrors.aliyun.com/docker-ce/linux/ubuntu/gpg | gpg --batch --yes --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://mirrors.aliyun.com/docker-ce/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null
# 更新apt源
apt update -y || log_error "更新apt源失败"
# apt install docker-ce=5:20.10.24~3-0~ubuntu-focal docker-ce-cli=5:20.10.24~3-0~ubuntu-focal containerd.io --allow-downgrades -y || log_error "安装docker失败"
apt install containerd --allow-downgrades -y || log_error "安装containerd失败"
systemctl enable containerd || log_error "启动containerd服务失败"
# 配置containerd-crictl
if [ ! -f /etc/crictl.yaml ]; then
sudo tee /etc/crictl.yaml > /dev/null <<EOF
runtime-endpoint: unix:///var/run/containerd/containerd.sock
image-endpoint: unix:///var/run/containerd/containerd.sock
timeout: 10
debug: false
pull-image-on-create: false
EOF
fi
# 安装kubeadm、kubelet、kubectl等
log_info "安装kubeadm、kubelet、kubectl等..."
# wget https://pkgs.k8s.io/core:/stable:/v1.21/deb/Release.key -O apt-key.gpg || log_error "下载kubeadm等的密钥失败"
# apt-key add apt-key.gpg && rm -f apt-key.gpg || log_error "导入&删除apt-key.gpg文件失败"
curl -s https://mirrors.aliyun.com/kubernetes/apt/doc/apt-key.gpg | apt-key add - || log_error "添加k8s镜像源的密钥失败"
# curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.28/deb/Release.key | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
# echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.28/deb/ /" | sudo tee /etc/apt/sources.list.d/kubernetes.list
echo "deb https://mirrors.aliyun.com/kubernetes/apt/ kubernetes-xenial main" | tee /etc/apt/sources.list.d/kubernetes.list
apt-get update -y || log_error "更新apt源以安装kubeadm等失败"
apt install -y kubelet=1.28.2-00 kubeadm=1.28.2-00 kubectl=1.28.2-00 --allow-downgrades --allow-change-held-packages || log_error "安装kubeadm,kubelet,kubectl失败"
apt-mark hold kubeadm kubelet kubectl # 防止自动升级导致的问题
systemctl enable kubelet && systemctl start kubelet || log_error "启动kubelet服务失败"
# 备份docker的daemon.json文件
# if [ -f /etc/docker/daemon.json ]; then
# cp /etc/docker/daemon.json /etc/docker/daemon.json.bak
# fi
# 配置docker的daemon.json
# cat <<EOF > /etc/docker/daemon.json
# {"registry-mirrors":["https://registry.docker-cn.com","https://registry.cn-hangzhou.aliyuncs.com"],"exec-opts": ["native.cgroupdriver=systemd"]}
# EOF
# 重新加载docker配置并重启docker服务
systemctl daemon-reload
# systemctl restart docker
# 初始化节点
sudo modprobe br_netfilter
sudo sysctl net.bridge.bridge-nf-call-iptables=1
# 加载必要内核模块
sudo modprobe overlay
sudo modprobe br_netfilter
# 编辑 `/etc/modules-load.d/k8s.conf` 添加以下内容:
cat <<EOF | sudo tee /etc/modules-load.d/k8s.conf
overlay
br_netfilter
EOF
# 编辑 `/etc/sysctl.d/k8s.conf` 配置网络参数:
cat <<EOF | sudo tee /etc/sysctl.d/k8s.conf
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward = 1
EOF
# 生效配置
sudo sysctl --system
# 将containerd默认配置写入文件
mkdir -p /etc/containerd
containerd config default > /etc/containerd/config.toml
# 创建目录
sudo mkdir -p /etc/containerd/certs.d
mkdir -p /etc/containerd/certs.d/docker.io
mkdir -p /etc/containerd/certs.d/registry.k8s.io
mkdir -p /etc/containerd/certs.d/gcr.io
## 定义阿里云镜像源地址
ALIYUN_DOCKER="https://registry.docker-cn.com"
ALIYUN_K8S="https://registry.aliyuncs.com/google_containers"
ALIYUN_GCR="$ALIYUN_K8S" # gcr.io 同样使用阿里云镜像源
# 配置文件路径
CONFIG_TOML="/etc/containerd/config.toml"
CERTS_DIR="/etc/containerd/certs.d"
# 1. 修改 containerd 配置文件
echo "正在配置 containerd 的镜像加速..."
if ! grep -q 'config_path' "$CONFIG_TOML"; then
# 在 config.toml 中添加 config_path 配置
sudo sed -i '$a\ [plugins."io.containerd.grpc.v1.cri".registry]\n config_path = "'"$CERTS_DIR"'"' "$CONFIG_TOML"
fi
# 2. 创建 certs.d 目录(如果不存在)
sudo mkdir -p "$CERTS_DIR"
# 3. 配置 Docker Hub 镜像加速
echo "配置 Docker Hub 镜像加速..."
sudo mkdir -p "$CERTS_DIR/docker.io"
cat <<EOF | sudo tee "$CERTS_DIR/docker.io/hosts.toml"
server = "https://docker.io"
[host."$ALIYUN_DOCKER"]
capabilities = ["pull", "resolve"]
EOF
# 4. 配置 Kubernetes 官方镜像源
echo "配置 Kubernetes 官方镜像加速..."
sudo mkdir -p "$CERTS_DIR/registry.k8s.io"
cat <<EOF | sudo tee "$CERTS_DIR/registry.k8s.io/hosts.toml"
server = "https://registry.k8s.io"
[host."$ALIYUN_K8S"]
capabilities = ["pull", "resolve"]
EOF
# 5. 配置 Google Container Registry (gcr.io)
echo "配置 Google Container Registry 镜像加速..."
sudo mkdir -p "$CERTS_DIR/gcr.io"
cat <<EOF | sudo tee "$CERTS_DIR/gcr.io/hosts.toml"
server = "https://gcr.io"
[host."$ALIYUN_GCR"]
capabilities = ["pull", "resolve"]
EOF
# 5. 修复 pause 镜像地址(使用阿里云镜像)
sudo sed -i 's|sandbox_image = "registry.k8s.io/pause:.*"|sandbox_image = "registry.aliyuncs.com/google_containers/pause:3.9"|g' /etc/containerd/config.toml
# --- 修正配置项 ---
# 1. 检查并设置 [plugins."io.containerd.grpc.v1.cri".containerd].systemd_cgroup = true
# echo "Checking/fixing 'systemd_cgroup' configuration..."
# if ! grep -q 'systemd_cgroup = true' "$CONFIG_TOML"; then
# # 使用 sed 直接替换整行,无需捕获组
# sed -i "/^\s*systemd_cgroup\s*=\s*.*/c\
# systemd_cgroup = true" "$CONFIG_TOML"
# echo "Modified 'systemd_cgroup' to 'true'."
# else
# echo "'systemd_cgroup' is already set to 'true'."
# fi
# 2. 检查并设置 [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options].SystemdCgroup = true
# 检查配置文件是否存在
if [ ! -f "$CONFIG_TOML" ]; then
echo "Error: Config file not found at $CONFIG_TOML"
exit 1
fi
echo "Checking/fixing 'SystemdCgroup' configuration..."
# 使用 sed 修改 SystemdCgroup 的值为 true,保留缩进
if ! grep -q '^\s*SystemdCgroup\s*=\s*true' "$CONFIG_TOML"; then
# 替换等号右侧的值,保留左侧的缩进和键名
sed -i 's/^\(\s*SystemdCgroup\s*=\s*\).*/\1true/' "$CONFIG_TOML"
echo "Modified 'SystemdCgroup' to 'true'. 修改后的值为:"
grep '^\s*SystemdCgroup\s*=\s*true' "$CONFIG_TOML"
else
echo "'SystemdCgroup' is already set to 'true'."
fi
# 3. 重启 containerd 服务
echo "Restarting containerd..."
sudo systemctl restart containerd
if [ $? -eq 0 ]; then
echo "containerd restarted successfully."
else
echo "Failed to restart containerd. Check logs for errors."
fi
# 验证配置
echo "Verifying configuration..."
crictl info | grep -i "systemd_cgroup" && crictl info | grep -i "SystemdCgroup"
echo "containerd配置初始纠正完成."
echo "开始更新containerd配置以适配GPU实例"
# 检查是否有 NVIDIA GPU
if lspci | grep -i nvidia > /dev/null 2>&1; then
log_info "检测到NVIDIA GPU,开始配置nvidia-container-runtime..."
dpkg -i /opt/*.deb || log_error "安装nvidia-container-runtime及其依赖失败!"
# 配置 containerd 支持 nvidia runtime
CONTAINERD_CONFIG="/etc/containerd/config.toml"
if ! grep -q '\[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia\]' "$CONTAINERD_CONFIG"; then
cat <<EOF >> "$CONTAINERD_CONFIG"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
privileged_without_host_devices = false
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
BinaryName = "/usr/bin/nvidia-container-runtime"
EOF
fi
# 重启 containerd
systemctl restart containerd
log_info "nvidia-container-runtime 配置完成,containerd已重启"
else
log_info "未检测到NVIDIA GPU,跳过nvidia-container-runtime配置"
fi
# 修改 DNS 为阿里云公共 DNS提升镜像拉取速度
# sudo tee /etc/resolv.conf <<EOF
# nameserver 223.5.5.5
# nameserver 223.6.6.6
# nameserver 8.8.8.8
# nameserver 114.114.114.114
# EOF
# 5. 验证配置是否生效
# sudo crictl --runtime-endpoint unix:///run/containerd/containerd.sock info
crictl info
# 开启ip转发
# 处理[ERROR FileContent--proc-sys-net-ipv4-ip_forward]: /proc/sys/net/ipv4/ip_forward contents are not set to 1问题
if ! grep -q "^net.ipv4.ip_forward = 1" /etc/sysctl.conf; then
echo "net.ipv4.ip_forward = 1" | sudo tee -a /etc/sysctl.conf > /dev/null
sudo sysctl -p
fi
# nfs_server_ip="192.168.0.3" # 替换为实际的NFS服务器IP
# nfs_share_path="/d/k8s_nss"
echo "======== 动态获取NFS服务器IP和共享目录 ========"
nfs_server_ip="$2" # 替换为实际的NFS服务器IP
nfs_share_path="$3" # 替换为实际的NFS服务器共享目录
# 不改变原有逻辑的基础上,将 K8s相关数据目录迁移到 $nfs_share_path 目录
log_info "迁移K8s相关数据目录到$nfs_share_path挂载点..."
# 迁移containerd数据目录
if [ ! -d $nfs_share_path/containerd ]; then
mkdir -p $nfs_share_path/containerd
fi
if [ -d /var/lib/containerd ] && [ ! -L /var/lib/containerd ]; then
systemctl stop containerd
mv /var/lib/containerd/* $nfs_share_path/containerd/ 2>/dev/null || true
rm -rf /var/lib/containerd
ln -sf $nfs_share_path/containerd /var/lib/
systemctl start containerd
fi
# 迁移kubelet数据目录
if [ ! -d $nfs_share_path/kubelet ]; then
mkdir -p $nfs_share_path/kubelet
fi
if [ ! -L /var/lib/kubelet ]; then
systemctl stop kubelet
mv /var/lib/kubelet/* $nfs_share_path/kubelet/ 2>/dev/null || true
rm -rf /var/lib/kubelet
ln -sf $nfs_share_path/kubelet /var/lib/
systemctl start kubelet
fi
# 迁移kubeadm数据目录
if [ ! -d $nfs_share_path/kubeadm ]; then
mkdir -p $nfs_share_path/kubeadm
fi
if [ ! -L /var/lib/kubeadm ]; then
mv /var/lib/kubeadm/* $nfs_share_path/kubeadm/ 2>/dev/null || true
rm -rf /var/lib/kubeadm
ln -sf $nfs_share_path/kubeadm /var/lib/
fi
# 迁移etcd数据目录仅master节点
if [ "$1" == "master" ]; then
if [ ! -d $nfs_share_path/etcd ]; then
mkdir -p $nfs_share_path/etcd
fi
if [ ! -L /var/lib/etcd ]; then
systemctl stop kubelet 2>/dev/null || true
mv /var/lib/etcd/* $nfs_share_path/etcd/ 2>/dev/null || true
rm -rf /var/lib/etcd
ln -sf $nfs_share_path/etcd /var/lib/
systemctl start kubelet 2>/dev/null || true
fi
fi
# 权限修正
chown -R root:root $nfs_share_path/containerd $nfs_share_path/kubelet $nfs_share_path/kubeadm $nfs_share_path/etcd 2>/dev/null || true
log_info "K8s数据目录迁移完成,所有数据将存储于$nfs_share_path下"
# 判断是主节点还是副节点
if [ "$1" == "master" ]; then
# 写入hosts
# if ! grep -q "k8s-master" /etc/hosts; then
# echo "127.0.0.1 k8s-master" | sudo tee -a /etc/hosts > /dev/null
# fi
# 修改主机名,这里假设新主机名为 k8s-node,可根据实际情况修改
hostnamectl set-hostname k8s-master || log_error "修改主机名失败"
# 防火墙开放端口
log_info "开放防火墙端口..."
# 安装并配置 ufw仅开放必要端口
# 开放 Kubernetes 控制平面端口
sudo ufw allow 6443/tcp
sudo ufw allow 10257/tcp
sudo ufw allow 2379:2380/tcp
# 开放 kubelet 和组件通信端口(仅限集群内部)
# 注意10250 端口需严格限制访问,避免暴露到公网
sudo ufw allow 10250:10252/tcp
# 开放 NodePort 服务端口范围
sudo ufw allow 30000:32767/tcp
# 开放 CNI 插件端口(如 Calico
sudo ufw allow 4789/udp
sudo ufw allow 179/tcp
# 开放 Ingress 端口(如 Nginx Ingress
sudo ufw allow 80/tcp
sudo ufw allow 443/tcp
# sudo ufw enable
# 主节点安装步骤
log_info "正在master节点进行安装core和初始化"
# kubeadm config images list
# 导入本地镜像减少拉取时间
chmod 755 /opt/import_images.sh && /opt/import_images.sh
sleep 1
log_info "初始化主节点..."
# kubeadm init --image-repository=registry.aliyuncs.com/google_containers --pod-network-cidr=10.244.0.0/16 --service-cidr=10.96.0.0/12 || log_error "主节点初始化失败"
# kubeadm init --config=kubeadm.yaml --pod-network-cidr=10.244.0.0/16 --service-cidr=10.96.0.0/12
kubeadm init --image-repository=registry.aliyuncs.com/google_containers --pod-network-cidr=10.244.0.0/16 --service-cidr=10.96.0.0/12 --kubernetes-version=v1.28.2 || log_error "主节点初始化失败"
# sudo chmod 644 /etc/kubernetes/pki/*
# sudo chown -R root:root /etc/kubernetes/pki
# 在主节点上执行以下命令来生成副节点加入的 join 指令
log_info "生成工作节点加入的join指令..."
join_command=$(kubeadm token create --print-join-command 2>/dev/null)
# join_command=$(kubeadm token create --print-join-command --ttl 0 2>/dev/null)
if [ -z "$join_command" ]; then
log_error "生成join指令失败"
else
echo "$join_command" > join_command.txt
echo "已将join命令保存到join_command.txt文件中,请在新窗口cat查看并拷贝到worker node进行集群注册"
# 这里可以继续执行后面的步骤
# 配置kubectl
log_info "配置kubectl..."
mkdir -p $HOME/.kube
cp -i /etc/kubernetes/admin.conf $HOME/.kube/config || log_error "复制kubeconfig文件失败"
chown $(id -u):$(id -g) $HOME/.kube/config || log_error "更改kubeconfig文件权限失败"
echo "master节点安装完毕..."
sleep 1
# 安装网络插件
log_info "正在安装网络插件(flannel)"
kubectl apply -f /opt/kube-flannel.yml || log_error "本地安装flannel网络插件失败"
log_info "正在安装MetricsServer插件"
kubectl apply -f /opt/components.yaml || log_error "本地安装MetricsServer插件失败"
log_info "正在安装Ingress-nginx-controller插件"
kubectl apply -f /opt/ingress-nginx-controller.yaml || log_error "本地安装ingress-nginx-controller插件失败"
log_info "正在安装GPU模式必要插件"
kubectl apply -f /opt/nvidia-device-plugin.yml || log_error "本地安装GPU模式必要插件失败"
log_info "正在安装nfs-client-provisioner插件"
aptitude -y install nfs-kernel-server nfs-common=1:1.3.4-2.5ubuntu3.7
if [ $? -ne 0 ]; then
echo "NFS 服务器端安装失败,请检查网络连接或软件源。"
exit 1
fi
# 创建集群共享目录
# 检查 NFS 共享目录是否存在,若不存在则创建
# 目前是控制节点承担所有共享存储,后期需要换成动态的NFS服务器
mkdir -p $nfs_share_path
# 定义要添加到 /etc/exports 的配置行
line="$nfs_share_path *(rw,sync,no_root_squash,no_subtree_check)"
# 检查 /etc/exports 文件是否已经包含指定行
if ! grep -qF "$line" /etc/exports; then
# 若不包含,则添加该行
echo "$line" >> /etc/exports
if [ $? -ne 0 ]; then
echo "共享目录配置文件修改失败,请检查文件权限。"
exit 1
else
echo "成功添加共享目录配置。"
fi
else
echo "共享目录配置已存在,无需重复添加。"
fi
# 启动 NFS 服务
echo "启动 NFS 服务..."
systemctl restart nfs-kernel-server
if [ $? -ne 0 ]; then
echo "NFS 服务启动失败,请检查配置文件。"
exit 1
fi
kubectl apply -f /opt/storage_class.yaml || log_error "集群存储类nfs-storage-class初始化失败"
#kubectl apply -f /opt/nfs-provisioner-deploy.yaml || log_error "动态存储nfs-provisioner-deploy初始化失败"
echo "!!! 此处更换成读取动态的NFS服务器: xxx.xx.xx.xxx 及共享目录: /a/b/c !!!"
nfs_provisioner_yaml='
apiVersion: apps/v1
kind: Deployment
metadata:
name: nfs-client-provisioner
labels:
app: nfs-client-provisioner
spec:
replicas: 1
strategy:
type: Recreate ## 设置升级策略为删除再创建(默认为滚动更新)
selector:
matchLabels:
app: nfs-client-provisioner
template:
metadata:
labels:
app: nfs-client-provisioner
spec:
serviceAccountName: nfs-client-provisioner
containers:
- name: nfs-client-provisioner
#image: gcr.io/k8s-staging-sig-storage/nfs-subdir-external-provisioner:v4.0.0
image: registry.cn-beijing.aliyuncs.com/xngczl/nfs-subdir-external-provisione:v4.0.0
volumeMounts:
- name: nfs-client-root
mountPath: /persistentvolumes
env:
- name: PROVISIONER_NAME ## Provisioner的名称,以后设置的storageclass要和这个保持一致
value: k8s-sigs.io/nfs-subdir-external-provisioner
- name: NFS_SERVER ## NFS服务器地址,需和volumes参数中配置的保持一致
value: '"$nfs_server_ip"' ## 动态传入的NFS服务器IP
- name: NFS_PATH ## NFS服务器数据存储目录,需和volumes参数中配置的保持一致
value: '"$nfs_share_path"' ## 动态传入的NFS服务器共享目录
volumes:
- name: nfs-client-root
nfs:
server: '"$nfs_server_ip"' ## NFS服务器地址
path: '"$nfs_share_path"' ## NFS服务器数据存储目录
readOnly: false
'
echo "$nfs_provisioner_yaml" | kubectl apply -f -
if [ $? -ne 0 ]; then
echo "nfs动态工具链创建失败"
exit 1
fi
kubectl apply -f /opt/nfs-rbac.yaml || log_error "集群共享存储权限nfs-rbac初始化失败"
# 修改 deployment.yaml 文件,设置 NFS 服务器地址和共享目录
# sed -i 's|NFS_SERVER|your_nfs_server_ip|g' deployment.yaml
# sed -i 's|NFS_PATH|your_nfs_shared_directory|g' deployment.yaml
# # 创建资源
# kubectl apply -f rbac.yaml
# kubectl apply -f deployment.yaml
# kubectl apply -f class.yaml
sleep 3
# 查询组件状态
log_info "查询组件状态..."
# 检查是否有组件状态为 Unhealthy
if kubectl get componentstatuses 2>/dev/null | grep -q 'Unhealthy'; then
echo "检测到组件状态为 Unhealthy, 开始修复..."
# 注释掉 --port=0 参数(添加备份文件)
sed -i.bak '/--port=0/s/^/#/' /etc/kubernetes/manifests/kube-controller-manager.yaml
sed -i.bak '/--port=0/s/^/#/' /etc/kubernetes/manifests/kube-scheduler.yaml
echo "已生成备份文件: kube-controller-manager.yaml.bak 和 kube-scheduler.yaml.bak"
echo "修复完成,等待组件重启..."
else
echo "所有组件状态正常,无需修复。"
fi
sleep 5
systemctl restart kubelet.service || log_error "重启kubelet服务失败"
log_info "30秒后再次查看组件状态..."
sleep 30
# 再次查看组件状态(需要稍等)
kubectl get cs || log_info "再次获取组件状态失败"
echo "验证集群状态(安装完毕后手动执行),查看pod状态"
log_info "查看pod状态..."
kubectl get nodes || log_info "获取节点状态失败"
kubectl get pods --all-namespaces || log_info "获取所有命名空间的pod状态失败"
fi
elif [ "$1" == "worker" ]; then
# 修改主机名
apt install telnet -y
aptitude -y install nfs-common=1:1.3.4-2.5ubuntu3.7
# 写入hosts
# if ! grep -q "k8s-worker" /etc/hosts; then
# echo "127.0.0.1 k8s-worker" | sudo tee -a /etc/hosts > /dev/null
# fi
# 这里假设新主机名为 k8s-node,可根据实际情况修改
hostnamectl set-hostname "k8s-worker-$(date +%Y%m%d%H%M%S)" || log_error "修改主机名失败"
# 副节点安装步骤
log_info "正在worker节点进行安装"
apt update -y || log_error "更新apt源失败"
# 从节点重启kubeadm,可解决曾启动过导致端口被占用的问题
log_info "从节点重启kubeadm,可解决曾启动过导致端口被占用的问题..."
kubeadm reset -f|| log_error "重置kubeadm失败"
# 获取主节点的join命令假设已提前获取并保存为join_command.txt
# 导入本地网络插件部分镜像减少拉取时间
chmod 755 /opt/import_images.sh && /opt/import_images.sh
echo "请输入加入对方kubernetes集群的命令: (任何时候)"
# read join_command
# eval "$join_command" || log_error "加入k8s集群失败"
else
echo "请指定正确的节点类型,master或worker"
exit 1
fi
# 检查安装过程是否有错误(这里只是简单示例,实际可能需要更详细的检查)
if [ $? -ne 0 ]; then
log_error "安装过程中出现错误,请手动解决后再重新执行"
fi
log_info "安装脚本执行完毕"
# 输出安装完成提示
log_info "Kubernetes 安装脚本执行完毕,请根据提示进行后续操作。"
log_info "如果是主节点,请在新窗口cat join_command.txt查看并拷贝到worker node进行集群注册"
log_info "如果是worker节点,请在新窗口输入主节点提供的join命令进行集群注册"
log_info "请注意,在执行完脚本后,可能需要等待一段时间以确保所有组件正常运行。"
log_info "可以使用 'kubectl get nodes' 和 'kubectl get pods --all-namespaces' 命令来检查集群状态。"
log_info "如果有任何问题,请检查日志或联系管理员Ahexl。"
log_info "感谢使用本脚本,祝您使用愉快!"

87
script/k8s_uninstall.sh Normal file
View File

@ -0,0 +1,87 @@
#!/bin/bash
# 停止K8s相关服务
echo "停止K8s相关服务..."
ps -aux | grep Opera | grep -v grep | awk '{print $2}' | xargs kill -9
systemctl stop kubelet
echo "移除缓存忘卡配置"
ip link delete cni0
systemctl stop kube-apiserver
systemctl stop nfs-kernel-server
rm -rf /k8sdata/*
# 执行kubeadm reset
echo "执行kubeadm reset..."
kubeadm reset -f
apt-get purge kubelet kubectl kubeadm kubernetes-cni -y --allow-change-held-packages
rm -rf /etc/cni/net.d
rm -rf /var/lib/kubelet /var/lib/kubernetes
rm -rf /etc/kubernetes/manifests
rm -rf /etc/kubernetes/pki
rm -rf /etc/kubernetes
rm -rf /var/lib/etcd
rm -rf /var/lib/cni
rm -rf /var/lib/docker
rm -rf /var/lib/containerd
rm -rf /var/lib/etcd
rm -rf /var/lib/kubelet
rm -rf /var/lib/kube-proxy
# 删除K8s配置文件
echo "删除K8s配置文件..."
sudo rm -rf /etc/kubernetes
# 删除K8s相关二进制文件假设在/usr/local/bin
echo "删除K8s相关二进制文件..."
sudo rm /usr/local/bin/kube*
# 清理Containerd数据谨慎操作
echo "清理Containerd数据..."
sudo rm -rf /var/lib/containerd /usr/bin/containerd*
sudo apt purge -y containerd containerd.io cri-tools --allow-change-held-packages
rm -rf /etc/containerd /var/lib/containerd /run/containerd
rm -f /etc/systemd/system/multi-user.target.wants/containerd.service
rm /lib/systemd/system/containerd.service
systemctl daemon-reload
# 清理iptables规则
echo "清理iptables规则..."
# sudo iptables -F && sudo iptables -t nat -F && sudo iptables -t mangle -F && sudo iptables -X
# ipvsadm --clear
apt autoremove -y
apt autoclean -y
apt clean -y
apt update -y
# 停止docker
echo "停止docker并清理..."
docker rmi $(docker images -q)
docker stop $(docker ps -aq) && docker rm $(docker ps -aq) && sudo systemctl stop docker
sudo systemctl stop docker.service
sudo systemctl stop docker.socket
rm -rf /etc/docker/daemon.json
rm -rf /usr/bin/docker-compose
# 清理Docker
apt-get purge docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin docker-ce-rootless-extras -y --allow-change-held-packages
apt purge -y containerd.io containerd
# 检查并删除当前root用户的kubeconfig文件
echo "检查并删除当前root用户的kubeconfig文件..."
sudo rm -rf $HOME/.kube/config
# kuboard
echo "清理kuboard相关配置..."
sed -i '/\/opt \*(rw,sync,no_root_squash)/d' /etc/exports
rm -rf /etc/apt/sources.list.d/docker*
rm -rf /etc/apt/sources.list.d/kubernetes*
rm -rf /etc/apt/sources.list.d/kuboard*
apt autoremove -y
apt autoclean -y
apt clean -y
echo "恭喜你!!! K8s相关内容已清理, 可准备重新安装。"

Binary file not shown.

Binary file not shown.

Binary file not shown.

15
script_test/readyaml.sh Executable file
View File

@ -0,0 +1,15 @@
#!/bin/bash
nfs_dynamic_yaml=""
while IFS= read -r line; do
nfs_dynamic_yaml+="$line\n" # 手动添加换行符
done < ../files/nfs-provisioner-deploy.yaml
echo -e "内容:\n$nfs_dynamic_yaml"
echo "$nfs_dynamic_yaml" | kubectl apply -f -
if [ $? -ne 0 ]; then
echo "资源创建失败"
exit 1
fi

View File

@ -0,0 +1,11 @@
info('test .....{params_kw=}')
#debug(f"accept cpcc node: {params_kw=}")
try:
result_dict["data"] = delete_cluster_node(params_kw)
result_dict["status"] = True
result_dict["info"] = "operate success"
except:
import traceback
debug(traceback.format_exc())
return result_dict

View File

@ -0,0 +1,11 @@
info('test .....{params_kw=}')
#debug(f"accept cpcc node: {params_kw=}")
try:
result_dict["data"] = delete_cluster_pod(params_kw)
result_dict["status"] = True
result_dict["info"] = "operate success"
except:
import traceback
debug(traceback.format_exc())
return result_dict

View File

@ -0,0 +1,11 @@
info('test .....{params_kw=}')
#debug(f"delete_cpcpod接收cpcc参数{params_kw=}")
try:
result_dict["data"] = yaml_apply_delete(params_kw)
result_dict["status"] = True
result_dict["info"] = "operate success"
except Exception as e:
print(e)
return result_dict

View File

@ -0,0 +1,16 @@
info('test .....{params_kw=}')
#endpoint=params_kw["endpoint"]
#if "_" not in endpoint and "-" not in endpoint:
# result_dict["info"] = "endpoint format not allowed"
# return result_dict
#debug(f"accept cpcc node: {params_kw=}")
try:
result_dict["data"] = determine_accommodat_by_kubeconfig(params_kw)
result_dict["status"] = True
result_dict["info"] = "operate success"
except:
import traceback
debug(traceback.format_exc())
return result_dict

View File

@ -0,0 +1,16 @@
info('test .....{params_kw=}')
#endpoint=params_kw["endpoint"]
#if "_" not in endpoint and "-" not in endpoint:
# result_dict["info"] = "endpoint format not allowed"
# return result_dict
#debug(f"accept cpcc node: {params_kw=}")
try:
result_dict["data"] = get_cluster_nodes_by_kubeconfig(params_kw)
result_dict["status"] = True
result_dict["info"] = "operate success"
except:
import traceback
debug(traceback.format_exc())
return result_dict

View File

@ -0,0 +1,16 @@
info('test .....{params_kw=}')
#endpoint=params_kw["endpoint"]
#if "_" not in endpoint and "-" not in endpoint:
# result_dict["info"] = "endpoint format not allowed"
# return result_dict
#debug(f"accept cpcc node: {params_kw=}")
try:
result_dict["data"] = get_cluster_pods_by_kubeconfig(params_kw)
result_dict["status"] = True
result_dict["info"] = "operate success"
except:
import traceback
debug(traceback.format_exc())
return result_dict

View File

@ -0,0 +1,10 @@
info('test .....{params_kw=}')
try:
result_dict["data"] = get_multiple_cluster()
result_dict["status"] = True
result_dict["info"] = "operate success"
except Exception as e:
print(e)
result_dict["data"] = [result]
return result_dict

View File

@ -0,0 +1,10 @@
info('test .....{params_kw=}')
try:
result_dict["data"] = get_multiple_cluster_pod()
result_dict["status"] = True
result_dict["info"] = "operate success"
except Exception as e:
print(e)
result_dict["data"] = [result]
return result_dict

View File

@ -0,0 +1,11 @@
info('test .....{params_kw=}')
debug(f"接收cpcc参数{params_kw=}")
try:
result_dict["data"] = new_cluster_install(params_kw)
result_dict["status"] = True
result_dict["info"] = "operate success"
except Exception as e:
print(e)
return result_dict

View File

@ -0,0 +1,11 @@
info('test .....{params_kw=}')
debug(f"接收cpcc参数{params_kw=}")
try:
result_dict["data"] = new_cluster_install(params_kw)
result_dict["status"] = True
result_dict["info"] = "operate success"
except Exception as e:
print(e)
return result_dict

View File

@ -0,0 +1,11 @@
info('test .....{params_kw=}')
debug(f"node_label_opt接收cpcc参数{params_kw=}")
try:
result_dict["data"] = node_label_opt(params_kw)
result_dict["status"] = True
result_dict["info"] = "operate success"
except Exception as e:
print(e)
return result_dict

View File

@ -0,0 +1,11 @@
info('test .....{params_kw=}')
debug(f"接收cpcc参数{params_kw=}")
try:
result_dict["data"] = node_state_switch(params_kw)
result_dict["status"] = True
result_dict["info"] = "operate success"
except Exception as e:
print(e)
return result_dict

View File

@ -0,0 +1,11 @@
info('test .....{params_kw=}')
#debug(f"update_cpcpod接收cpcc参数{params_kw=}")
try:
result_dict["data"] = yaml_apply_delete(params_kw)
result_dict["status"] = True
result_dict["info"] = "operate success"
except Exception as e:
print(e)
return result_dict

View File

@ -0,0 +1,12 @@
info('test .....{params_kw=}')
debug(f"接收cpcc参数{params_kw=}")
try:
result_dict["data"] = yaml_apply_delete(params_kw)
result_dict["status"] = True
result_dict["info"] = "operate success"
except Exception as e:
debug(f'{e}')
result_dict["info"] = e
return result_dict

View File

@ -0,0 +1,73 @@
async def create_pod(ns={}):
import hashlib
ns['pvcname'] = hashlib.md5(str(time.time()).encode()).hexdigest()[:10]
ns['podname'] = ns['pvcname']
ns['containername'] = ns['pvcname']
ns['volumename'] = ns['pvcname']
ns['namespace'] = ns['namespace'] if ns.get('namespace') else 'default'
namespace = ns['namespace'] # 使用的命名空间
core_api = client.CoreV1Api()
# 创建 PVC
#create_persistent_volume_claim(core_api, namespace)
pvc = client.V1PersistentVolumeClaim(
metadata=client.V1ObjectMeta(name=ns['pvcname']),
spec=client.V1PersistentVolumeClaimSpec(
access_modes=["ReadWriteOnce"],
resources=client.V1ResourceRequirements(
requests={"storage": str(ns['storage']) + "Gi"}
)
)
)
core_api.create_namespaced_persistent_volume_claim(namespace=namespace, body=pvc)
print("PVC created.")
# 创建 Pod
# create_pod(core_api, namespace)
pod = client.V1Pod(
metadata=client.V1ObjectMeta(name=ns['podname']),
spec=client.V1PodSpec(
containers=[
client.V1Container(
name=ns['containername'],
image=ns['image'], # 实例镜像,由参数指定
command=["tail", "-f", "/dev/null"], # 确保容器保持运行
resources=client.V1ResourceRequirements(
requests={
"cpu": str(ns['cpu']), # 请求的CPU核数
"memory": str(ns['memory']) + "Gi", # 请求的内存(Gi)
},
limits={
"cpu": str(ns['cpu']), # CPU上限,与请求值相同
"memory": str(ns['memory']) + "Gi", # 内存上限(Gi),与请求值相同
"nvidia.com/gpu": ns['gpu'],
"nvidia.com/gpumem": ns['gpumem']
},
),
volume_mounts=[
client.V1VolumeMount(
name=ns['volumename'],
mount_path="/usr/share/", # 挂载路径
)
],
)
],
volumes=[
client.V1Volume(
name=ns['volumename'],
persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
claim_name=ns['pvcname']
),
)
],
)
)
core_api.create_namespaced_pod(namespace=namespace, body=pod)
ns['status'] = True
ns['msg'] = '创建实例成功'
return ns
ret = await create_pod(params_kw)
return ret

View File

@ -0,0 +1,103 @@
async def get_available_resources(ns={}):
# 创建 API 实例
v1 = client.CoreV1Api()
# 获取所有节点
nodes = v1.list_node()
# 获取所有 Pod
pods = v1.list_pod_for_all_namespaces()
# 存储节点资源信息
node_resources = {}
total_allocatable = {
'cpu': 0,
'memory': 0,
'gpu': 0,
'storage': 0
}
total_used = {
'cpu': 0,
'memory': 0,
'gpu': 0,
'storage': 0
}
for node in nodes.items:
name = node.metadata.name
allocatable = node.status.allocatable
node_resources[name] = {
'cpu_allocatable': int(allocatable.get('cpu', '0').rstrip('m')) / 1000 if 'm' in allocatable.get('cpu', '0') else int(allocatable.get('cpu', '0')),
'memory_allocatable': int(allocatable.get('memory', '0').rstrip('Ki')) / 1024, # Ki -> Mi,与下方已用内存单位一致
'gpu_allocatable': int(allocatable.get('nvidia.com/gpu', '0')),
'storage_allocatable': int(allocatable.get('ephemeral-storage', '0').rstrip('Ki')) / 1024 # Ki -> Mi,与下方已用存储单位一致
}
# 累加总可分配资源
total_allocatable['cpu'] += node_resources[name]['cpu_allocatable']
total_allocatable['memory'] += node_resources[name]['memory_allocatable']
total_allocatable['gpu'] += node_resources[name]['gpu_allocatable']
total_allocatable['storage'] += node_resources[name]['storage_allocatable']
# 初始化已分配
node_resources[name].update({
'cpu_used': 0,
'memory_used': 0,
'gpu_used': 0,
'storage_used': 0
})
# 遍历所有 Pod,统计每个节点的已分配资源
for pod in pods.items:
if pod.spec.node_name: # 确保 Pod 已被调度到节点
node_name = pod.spec.node_name
for container in pod.spec.containers:
reque = container.resources.requests or {}
node_resources[node_name]['cpu_used'] += float(reque.get('cpu', '0').rstrip('m')) / 1000 if 'm' in reque.get('cpu', '0') else float(reque.get('cpu', '0'))
node_resources[node_name]['memory_used'] += int(reque.get('memory', '0').rstrip('Mi')) if 'Mi' in reque.get('memory', '0') else int(reque.get('memory', '0').rstrip('Gi')) * 1024
node_resources[node_name]['gpu_used'] += int(reque.get('nvidia.com/gpu', '0'))
node_resources[node_name]['storage_used'] += int(reque.get('ephemeral-storage', '0').rstrip('Mi')) if 'Mi' in reque.get('ephemeral-storage', '0') else 0
# 计算总已使用资源
for node_name, resources in node_resources.items():
total_used['cpu'] += resources['cpu_used']
total_used['memory'] += resources['memory_used']
total_used['gpu'] += resources['gpu_used']
total_used['storage'] += resources['storage_used']
# 计算节点的资源占用情况
print(f"Node: {node_name}")
print(f" CPU Remaining: {resources['cpu_allocatable'] - resources['cpu_used']} cores")
print(f" Memory Remaining: {resources['memory_allocatable'] - resources['memory_used']} Mi")
print(f" GPU Remaining: {resources['gpu_allocatable'] - resources['gpu_used']} GPUs")
print(f" Storage Remaining: {resources['storage_allocatable'] - resources['storage_used']} Mi")
print()
# 计算总剩余资源和使用百分比
total_remaining = {key: total_allocatable[key] - total_used[key] for key in total_allocatable}
usage_percentage = {key: (total_used[key] / total_allocatable[key] * 100 if total_allocatable[key] > 0 else 0) for key in total_allocatable}
# 输出总资源和使用情况
print("Cluster Resource Summary:")
print(f" Total Allocatable CPU: {total_allocatable['cpu']} cores")
print(f" Total Allocatable Memory: {total_allocatable['memory']} Mi")
print(f" Total Allocatable GPU: {total_allocatable['gpu']} GPUs")
print(f" Total Allocatable Storage: {total_allocatable['storage']} Mi")
print()
print(f" CPU Usage Percentage: {usage_percentage['cpu']:.2f}%")
print(f" Memory Usage Percentage: {usage_percentage['memory']:.2f}%")
print(f" GPU Usage Percentage: {usage_percentage['gpu']:.2f}%")
print(f" Storage Usage Percentage: {usage_percentage['storage']:.2f}%")
print()
# 返回数据
return {
"total_allocatable": total_allocatable,
"total_used": total_used,
"total_remaining": total_remaining,
"usage_percentage": usage_percentage
}
ret = await get_available_resources(params_kw)
return ret

View File

@ -0,0 +1,11 @@
info('test .....{params_kw=}')
print(params_kw)
uid=params_kw["uid"]
uid_number=params_kw["uid_number"]
plaintext_password=params_kw["plaintext_password"]
cn=params_kw["cn"]
result=add_ldap_user(uid,uid_number,plaintext_password,cn)
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,7 @@
info('test .....{params_kw=}')
uid=params_kw["uid"]
result=delete_ldap_user(uid)
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,6 @@
info('test .....{params_kw=}')
result=get_all_ldap_cn()
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,6 @@
info('test .....{params_kw=}')
result=get_all_ldap_user()
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,7 @@
info('test .....{params_kw=}')
cn=params_kw["cn"]
result=get_one_cn(cn)
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-
# @Time: 2024/12/5 17:07
import kubernetes.client
from kubernetes.client.rest import ApiException
from kubernetes import config
# 加载 Kubernetes 配置(如果在集群内运行则可以忽略)
config.load_kube_config()
def check_resource_availability(cpu_request, memory_request, storage_request, gpu_request):
"""
检查集群是否有足够资源来创建 Pod。
:param cpu_request: 请求的 CPU(单位为millicpu)
:param memory_request: 请求的内存单位为MiB
:param storage_request: 请求的存储单位为GiB
:param gpu_request: 请求的 GPU 数量
:return: 是否有足够资源
"""
v1 = kubernetes.client.CoreV1Api()
# 获取所有节点的信息
nodes = v1.list_node()
for node in nodes.items:
cpu_capacity = node.status.capacity['cpu']
memory_capacity = node.status.capacity['memory']
storage_capacity = node.status.capacity.get('ephemeral-storage', '0Gi') # 有些节点存储没有显示
gpu_capacity = 0 # 默认无 GPU
if 'nvidia.com/gpu' in node.status.capacity:
gpu_capacity = node.status.capacity['nvidia.com/gpu']
# 统一单位后再比较:节点CPU按核、内存/存储按Ki上报,请求分别为millicpu、Mi、Gi
cpu_capacity = int(cpu_capacity) * 1000 # 核 -> millicpu
memory_capacity = int(memory_capacity[:-2]) // 1024 # Ki -> Mi
storage_capacity = int(storage_capacity[:-2]) // (1024 * 1024) # Ki -> Gi
gpu_capacity = int(gpu_capacity)
# 判断该节点是否满足资源请求
if (cpu_capacity >= cpu_request and
memory_capacity >= memory_request and
storage_capacity >= storage_request and
gpu_capacity >= gpu_request):
print(f"Node {node.metadata.name} has enough resources.")
return True
return False
def create_pod(cpu_request, memory_request, storage_request, gpu_request):
"""创建 Pod,先检查资源是否足够"""
if not check_resource_availability(cpu_request, memory_request, storage_request, gpu_request):
print("No node has enough resources to fulfill the request.")
return
# 如果资源足够,则创建 Pod
v1 = kubernetes.client.CoreV1Api()
pod_manifest = {
"apiVersion": "v1",
"kind": "Pod",
"metadata": {"name": "my-pod"},
"spec": {
"containers": [{
"name": "my-container",
"image": "nginx", # 可替换为需要的镜像
"resources": {
"requests": {
"cpu": f"{cpu_request}m", # 单位是millicpu
"memory": f"{memory_request}Mi", # 单位是Mi
"ephemeral-storage": f"{storage_request}Gi", # 单位是Gi
},
"limits": {
"cpu": f"{cpu_request}m",
"memory": f"{memory_request}Mi",
"ephemeral-storage": f"{storage_request}Gi",
}
}
}]
}
}
try:
# 创建 Pod
v1.create_namespaced_pod(namespace="default", body=pod_manifest)
print("Pod created successfully.")
except ApiException as e:
print(f"Error creating pod: {e}")
# 示例使用:请求 1 CPU、2G 内存、30G 存储、0 GPU
create_pod(1000, 2048, 30, 0) # 1000m CPU, 2048Mi 内存, 30Gi 存储, 0 GPU

View File

@ -0,0 +1,33 @@
async def server_instance_delete(ns={}):
from kubernetes.client.rest import ApiException
# 加载 kubeconfig 配置
# config.load_kube_config()
# 设置你要删除的 pod 名称和所在的命名空间
namespace = ns['namespace'] if ns.get('namespace') else 'default'
podname = ns.get('podname')
pvcname = ns.get('pvcname')
# 创建 Pod API 客户端
v1 = client.CoreV1Api()
try:
# 删除 Pod
v1.delete_namespaced_pod(name=podname, namespace=namespace)
print(f"Pod {podname} 删除成功")
v1.delete_namespaced_persistent_volume_claim(name=pvcname, namespace=namespace)
print(f"PVC {pvcname} 删除成功")
return {
'status': True,
'msg': '实例删除成功'
}
except ApiException as e:
print(f"删除 Pod/PVC 失败: {e}")
return {
'status': False,
'msg': '实例删除失败, %s' % str(e)
}
ret = await server_instance_delete(params_kw)
return ret

View File

@ -0,0 +1,7 @@
info('test .....{params_kw=}')
result=get_history_list(params_kw)
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,7 @@
info('test .....{params_kw=}')
result=get_history_list_json(params_kw)
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,7 @@
info('test .....{params_kw=}')
result=get_real_time_list(params_kw)
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,7 @@
info('test .....{params_kw=}')
result=get_real_time_list_json(params_kw)
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,7 @@
info('test .....{params_kw=}')
result=kill_job(params_kw["jobId"])
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,7 @@
info('test .....{params_kw=}')
result=resume_job(params_kw["jobId"])
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,7 @@
info('test .....{params_kw=}')
result=submit_job(params_kw["command"])
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,7 @@
info('test .....{params_kw=}')
result=suspend_job(params_kw["jobId"])
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,20 @@
info('test .....{params_kw=}')
print(params_kw)
if "PartitionName" in params_kw:
result_partition=list_partition_detail_json(params_kw)
if len(result_partition)>0:
result=get_node_details_json(result_partition[0]["Nodes"])
else:
result=[]
result_dict["data"]=result
return result_dict
else:
result=get_node_details_json(params_kw["NodeName"])
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,7 @@
info('test .....{params_kw=}')
result=update_node(params_kw)
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,7 @@
info('test .....{params_kw=}')
result=create_partition(params_kw)
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,7 @@
info('test .....{params_kw=}')
result=delete_partition(params_kw)
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,5 @@
result=list_partition_detail_json(params_kw)
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,6 @@
info('test .....{params_kw=}')
result=list_partition_info(params_kw)
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,7 @@
info('test .....{params_kw=}')
result=update_partition(params_kw)
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,7 @@
info('test .....{params_kw=}')
result=get_storage_json(params_kw["point"])
result_dict["data"]=result
return result_dict

View File

@ -0,0 +1,223 @@
# Authentication
- HTTP Authentication, scheme: basic
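下面是一个手工构造 BasicAuth 请求头并调用本文档接口的示意草图(用户名、密码为占位值,服务地址沿用本项目示例域名;实际也可直接使用 requests 的 auth 参数):
```
# 示意草图:构造 HTTP Basic Auth 头并调用 ldap 接口(用户名/密码为占位值)
import base64
import requests

user, password = "<用户名>", "<密码>"
token = base64.b64encode(f"{user}:{password}".encode()).decode()
headers = {"Authorization": f"Basic {token}"}

resp = requests.get(
    "https://pcapi.opencomputing.cn/api/v1/ldap/get_all_ldap_user",
    headers=headers,
)
print(resp.json())
```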
# ldap相关
## GET 获取全部ldapUser
GET /api/v1/ldap/get_all_ldap_user
> 返回示例
> 200 Response
```json
{
"status": "success",
"data": [
{
"attributes": {
"cn": [
"test1"
],
"mail": [],
"sn": [
"test1"
]
},
"dn": "uid=test1,ou=test,dc=test,dc=com"
},
{
"attributes": {
"cn": [
"test"
],
"mail": [],
"sn": [
"test_add2"
]
},
"dn": "uid=test_add2,ou=test,dc=test,dc=com"
}
]
}
```
### 返回结果
| 状态码 | 状态码含义 | 说明 | 数据模型 |
| --- | ------------------------------------------------------- | ---- | ------ |
| 200 | [OK](https://tools.ietf.org/html/rfc7231#section-6.3.1) | none | Inline |
## GET 删除某个LdapUser
GET /api/v1/ldap/delete_ldap_user
### 请求参数
| 名称 | 位置 | 类型 | 必选 | 说明 |
| --- | ----- | ------ | --- | ---- |
| uid | query | string | 否 | none |
> 返回示例
> 200 Response
```json
{
  "status": "success",
  "data": {
    "result": 0,
    "description": "success",
    "dn": "",
    "message": "",
    "referrals": null,
    "type": "delResponse"
  }
}
```
### 返回结果
| 状态码 | 状态码含义 | 说明 | 数据模型 |
| --- | ------------------------------------------------------- | ---- | ------ |
| 200 | [OK](https://tools.ietf.org/html/rfc7231#section-6.3.1) | none | Inline |
## GET 获取某个CN
GET /api/v1/ldap/get_one_cn
### 请求参数
| 名称 | 位置 | 类型 | 必选 | 说明 |
| --- | ----- | ------ | --- | ---- |
| cn | query | string | 否 | none |
> 返回示例
> 200 Response
```json
{
"status": "success",
"data":{
"attributes": {
"cn": [
"test"
],
"gidNumber": [
47758
],
"objectClass": [
"posixGroup",
"top"
]
},
"dn": "cn=test,ou=test,dc=test,dc=com"
}
}
```
### 返回结果
| 状态码 | 状态码含义 | 说明 | 数据模型 |
| --- | ------------------------------------------------------- | ---- | ------ |
| 200 | [OK](https://tools.ietf.org/html/rfc7231#section-6.3.1) | none | Inline |
## GET 新增某个LdapUser
GET /api/v1/ldap/add_ldap_user
### 请求参数
| 名称 | 位置 | 类型 | 必选 | 说明 |
| ------------------ | ----- | ------ | --- | ------ |
| uid | query | string | 否 | 集群账号 |
| uid_number | query | string | 否 | 集群账号id |
| plaintext_password | query | string | 否 | 密码 |
| cn | query | string | 否 | none |
> 返回示例
> 200 Response
```json
{
  "status": "success",
  "data": {
    "result": 0,
    "description": "success",
    "dn": "",
    "message": "",
    "referrals": null,
    "type": "modifyResponse"
  }
}
```
### 返回结果
| 状态码 | 状态码含义 | 说明 | 数据模型 |
| --- | ------------------------------------------------------- | ---- | ------ |
| 200 | [OK](https://tools.ietf.org/html/rfc7231#section-6.3.1) | none | Inline |
## GET 获取全部ldapCN
GET /api/v1/ldap/get_all_ldap_cn
> 返回示例
> 200 Response
```json
{
"status": "success",
"data": [
{
"attributes": {
"cn": [
"testGroup"
],
"gidNumber": [
34423
],
"objectClass": [
"posixGroup",
"top"
]
},
"dn": "cn=testGroup,ou=test,dc=test,dc=com"
},
{
"attributes": {
"cn": [
"test"
],
"gidNumber": [
47758
],
"objectClass": [
"posixGroup",
"top"
]
},
"dn": "cn=test,ou=test,dc=test,dc=com"
}
]
}
```
### 返回结果
| 状态码 | 状态码含义 | 说明 | 数据模型 |
| --- | ------------------------------------------------------- | ---- | ------ |
| 200 | [OK](https://tools.ietf.org/html/rfc7231#section-6.3.1) | none | Inline |

View File

@ -0,0 +1,15 @@
## GET V1获取历史作业json
* url: /api/v1/slurm/job/get_history_list_json
* query:
* startStartTime 在这个时间之后提交的作业
* group 集群用户组
* jobId 作业id
* accountUserName 集群账号
* jobIdList 作业id列表
* statusList 状态列表
### 查看运行的作业
`/api/v1/slurm/job/get_history_list_json?statusList=running`
### 查看2025年的作业
`/api/v1/slurm/job/get_history_list_json?startStartTime=2025-01-01T00:00:00`
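下面是按上述 query 参数组合过滤历史作业的调用示意草图(凭据为占位值,服务地址沿用本项目示例域名):
```
# 示意草图:按状态和起始时间过滤历史作业(参数名与上文 query 一致,凭据为占位值)
import requests

resp = requests.get(
    "https://pcapi.opencomputing.cn/api/v1/slurm/job/get_history_list_json",
    params={
        "statusList": "running",
        "startStartTime": "2025-01-01T00:00:00",
    },
    auth=("<用户名>", "<密码>"),  # HTTP Basic Auth
)
print(resp.json().get("data", []))
```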

View File

@ -0,0 +1,14 @@
## GET V1 修改节点
* url: /api/v1/slurm/node/update_node
* query
* NodeName:节点名称
* State:状态
* Reason:原因
### 将节点设置为维护状态
`/api/v1/slurm/node/update_node?NodeName=CENI-KFSJK&State=DRAIN&Reason=weihu`
### 恢复节点(解除维护,重新上线)
`/api/v1/slurm/node/update_node?NodeName=node01&State=RESUME`
### 将节点设置为下线状态
`/api/v1/slurm/node/update_node?NodeName=CENI-KFSJK&State=DOWN&Reason=weihu`

View File

@ -0,0 +1,317 @@
## GET V1查询队列详细json
* url: /api/v1/slurm/partition/list_partition_detail_json
* response
```json
{
"status": "success",
"data": [
{
"PartitionName": "master",
"AllowGroups": "ALL",
"AllowAccounts": "ALL",
"AllowQos": "ALL",
"AllocNodes": "ALL",
"Default": "NO",
"QoS": "N/A",
"DefaultTime": "NONE",
"DisableRootJobs": "NO",
"ExclusiveUser": "NO",
"GraceTime": "0",
"Hidden": "NO",
"MaxNodes": "UNLIMITED",
"MaxTime": "UNLIMITED",
"MinNodes": "0",
"LLN": "NO",
"MaxCPUsPerNode": "UNLIMITED",
"Nodes": "CENI-KFSJK,CENI-CSSJK",
"PriorityJobFactor": "1",
"PriorityTier": "1",
"RootOnly": "NO",
"ReqResv": "NO",
"OverSubscribe": "NO",
"OverTimeLimit": "NONE",
"PreemptMode": "OFF",
"State": "UP",
"TotalCPUs": "16",
"TotalNodes": "2",
"SelectTypeParameters": "NONE",
"JobDefaults": "(null)",
"DefMemPerNode": "UNLIMITED",
"MaxMemPerNode": "UNLIMITED"
}
]
}
```
## GET V1创建队列
* url:/api/v1/slurm/partition/create_partition
### 创建一个队列名为kaiyuanyun
* query
* PartitionName:kaiyuanyun
`/api/v1/slurm/partition/create_partition?PartitionName=kaiyuanyun`
### 创建一个队列名为kaiyuanyun且节点包含CENI-KFSJK
* query
* PartitionName:kaiyuanyun
* Nodes:CENI-KFSJK
`/api/v1/slurm/partition/create_partition?PartitionName=kaiyuanyun&Nodes=CENI-KFSJK`
### 创建一个队列名为kaiyuanyun且节点包含CENI-KFSJK且只允许testgroup
`/api/v1/slurm/partition/create_partition?PartitionName=kaiyuanyun&Nodes=CENI-KFSJK&AllowGroups=testgroup`
## GET V1修改队列
* url:/api/v1/slurm/partition/update_partition
### 修改队列名为kaiyuanyun的节点为CENI-KFSJK
* query
* PartitionName:kaiyuanyun
* Nodes:CENI-KFSJK
`/api/v1/slurm/partition/update_partition?PartitionName=kaiyuanyun&Nodes=CENI-KFSJK`
### 修改队列名为kaiyuanyun 节点为CENI-KFSJK且只允许testgroup
`/api/v1/slurm/partition/update_partition?PartitionName=kaiyuanyun&Nodes=CENI-KFSJK&AllowGroups=testgroup`
### 启用队列
`/api/v1/slurm/partition/update_partition?PartitionName=kaiyuanyun&State=UP`
### 禁用队列
`/api/v1/slurm/partition/update_partition?PartitionName=kaiyuanyun&State=DOWN`
## GET V1删除队列
* url:/api/v1/slurm/partition/delete_partition
### 删除kaiyuanyun队列
* query
* PartitionName:kaiyuanyun
`/api/v1/slurm/partition/delete_partition?PartitionName=kaiyuanyun`
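下面是按上述接口先创建队列、再将其禁用的调用示意草图(队列名、节点名沿用上文示例,凭据为占位值):
```
# 示意草图:创建 kaiyuanyun 队列并随后禁用(参数名与上文一致,凭据为占位值)
import requests

BASE = "https://pcapi.opencomputing.cn/api/v1/slurm/partition"
AUTH = ("<用户名>", "<密码>")  # HTTP Basic Auth

requests.get(f"{BASE}/create_partition",
             params={"PartitionName": "kaiyuanyun", "Nodes": "CENI-KFSJK"},
             auth=AUTH)
requests.get(f"{BASE}/update_partition",
             params={"PartitionName": "kaiyuanyun", "State": "DOWN"},
             auth=AUTH)
```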
属性字段参考
```markdown
### 1. **PartitionName**
- **含义**:分区的名称。
- **示例**`master`
- **说明**:这是分区的唯一标识符,用户提交作业时可以指定分区。
---
### 2. **AllowGroups**
- **含义**:允许使用该分区的用户组。
- **示例**`ALL`
- **说明**`ALL` 表示所有用户组都可以使用该分区。
---
### 3. **AllowAccounts**
- **含义**:允许使用该分区的账户。
- **示例**`ALL`
- **说明**`ALL` 表示所有账户都可以使用该分区。
---
### 4. **AllowQos**
- **含义**允许使用该分区的服务质量QoS
- **示例**`ALL`
- **说明**`ALL` 表示所有 QoS 都可以在该分区中使用。
---
### 5. **AllocNodes**
- **含义**:允许分配节点的规则。
- **示例**`ALL`
- **说明**`ALL` 表示可以分配所有节点。
---
### 6. **Default**
- **含义**:是否为默认分区。
- **示例**`NO`
- **说明**`NO` 表示该分区不是默认分区。如果用户未指定分区,作业将提交到默认分区。
---
### 7. **QoS**
- **含义**分区的默认服务质量QoS
- **示例**`N/A`
- **说明**`N/A` 表示该分区没有配置默认的 QoS。
---
### 8. **DefaultTime**
- **含义**:分区的默认作业时间限制。
- **示例**`NONE`
- **说明**`NONE` 表示该分区没有配置默认的作业时间限制。
---
### 9. **DisableRootJobs**
- **含义**:是否禁止 root 用户提交作业。
- **示例**`NO`
- **说明**`NO` 表示允许 root 用户提交作业。
---
### 10. **ExclusiveUser**
- **含义**:是否允许独占用户。
- **示例**`NO`
- **说明**`NO` 表示不允许用户独占分区。
---
### 11. **GraceTime**
- **含义**:作业结束后的宽限时间(单位:秒)。
- **示例**`0`
- **说明**`0` 表示作业结束后立即释放资源。
---
### 12. **Hidden**
- **含义**:分区是否隐藏。
- **示例**`NO`
- **说明**`NO` 表示该分区对用户可见。
---
### 13. **MaxNodes**
- **含义**:单个作业可以使用的最大节点数。
- **示例**`UNLIMITED`
- **说明**`UNLIMITED` 表示没有限制。
---
### 14. **MaxTime**
- **含义**:作业的最大运行时间。
- **示例**`UNLIMITED`
- **说明**`UNLIMITED` 表示作业可以无限期运行。
---
### 15. **MinNodes**
- **含义**:单个作业可以使用的最小节点数。
- **示例**`0`
- **说明**`0` 表示作业可以使用任意数量的节点。
---
### 16. **LLN**
- **含义**是否为低延迟网络Low Latency Network分区。
- **示例**`NO`
- **说明**`NO` 表示该分区不是低延迟网络分区。
---
### 17. **MaxCPUsPerNode**
- **含义**:每个节点上可以使用的最大 CPU 核心数。
- **示例**`UNLIMITED`
- **说明**`UNLIMITED` 表示没有限制。
---
### 18. **Nodes**
- **含义**:分区中包含的节点列表。
- **示例**`CENI-KFSJK,CENI-CSSJK`
- **说明**:该分区包含 `CENI-KFSJK``CENI-CSSJK` 两个节点。
---
### 19. **PriorityJobFactor**
- **含义**:作业优先级因子。
- **示例**`1`
- **说明**`1` 表示该分区的作业优先级因子为 1。
---
### 20. **PriorityTier**
- **含义**:分区的优先级层级。
- **示例**`1`
- **说明**`1` 表示该分区的优先级层级为 1。
---
### 21. **RootOnly**
- **含义**:是否仅允许 root 用户提交作业。
- **示例**`NO`
- **说明**`NO` 表示允许所有用户提交作业。
---
### 22. **ReqResv**
- **含义**:是否要求预留资源。
- **示例**`NO`
- **说明**`NO` 表示不要求预留资源。
---
### 23. **OverSubscribe**
- **含义**:是否允许超额订阅资源。
- **示例**`NO`
- **说明**`NO` 表示不允许超额订阅资源。
---
### 24. **OverTimeLimit**
- **含义**:作业超时后的处理方式。
- **示例**`NONE`
- **说明**`NONE` 表示作业超时后不采取任何特殊处理。
---
### 25. **PreemptMode**
- **含义**:作业抢占模式。
- **示例**`OFF`
- **说明**`OFF` 表示不允许作业抢占。
---
### 26. **State**
- **含义**:分区的当前状态。
- **示例**`UP`
- **说明**`UP` 表示该分区处于可用状态。
---
### 27. **TotalCPUs**
- **含义**:分区中所有节点的总 CPU 核心数。
- **示例**`16`
- **说明**:该分区总共有 16 个 CPU 核心。
---
### 28. **TotalNodes**
- **含义**:分区中包含的节点总数。
- **示例**`2`
- **说明**:该分区包含 2 个节点。
---
### 29. **SelectTypeParameters**
- **含义**:节点选择类型的参数。
- **示例**`NONE`
- **说明**`NONE` 表示没有特殊的节点选择参数。
---
### 30. **JobDefaults**
- **含义**:作业的默认配置。
- **示例**`(null)`
- **说明**`(null)` 表示没有配置作业的默认参数。
---
### 31. **DefMemPerNode**
- **含义**:每个节点的默认内存限制。
- **示例**`UNLIMITED`
- **说明**`UNLIMITED` 表示没有默认的内存限制。
---
### 32. **MaxMemPerNode**
- **含义**:每个节点的最大内存限制。
- **示例**`UNLIMITED`
- **说明**`UNLIMITED` 表示没有最大内存限制。
---
```

View File

@ -0,0 +1,88 @@
## GET 获取存储使用情况json
* GET /api/v1/storage/common/get_storage_json
* 请求参数: point(挂载点)
* 返回示例
```json
{
"status": "success",
"data": [
{
"Filesystem": "tmpfs",
"Type": "tmpfs",
"Size": "3.2G",
"Used": "1.3M",
"Avail": "3.2G",
"Use%": "1%",
"Mounted": "/run"
},
{
"Filesystem": "/dev/mapper/ubuntu--vg-ubuntu--lv",
"Type": "ext4",
"Size": "48G",
"Used": "13G",
"Avail": "33G",
"Use%": "29%",
"Mounted": "/"
},
{
"Filesystem": "tmpfs",
"Type": "tmpfs",
"Size": "16G",
"Used": "0",
"Avail": "16G",
"Use%": "0%",
"Mounted": "/dev/shm"
},
{
"Filesystem": "tmpfs",
"Type": "tmpfs",
"Size": "5.0M",
"Used": "0",
"Avail": "5.0M",
"Use%": "0%",
"Mounted": "/run/lock"
},
{
"Filesystem": "/dev/vda2",
"Type": "ext4",
"Size": "2.0G",
"Used": "253M",
"Avail": "1.6G",
"Use%": "14%",
"Mounted": "/boot"
},
{
"Filesystem": "nfsserver:/d",
"Type": "nfs4",
"Size": "1.8T",
"Used": "114G",
"Avail": "1.6T",
"Use%": "7%",
"Mounted": "/d"
},
{
"Filesystem": "tmpfs",
"Type": "tmpfs",
"Size": "3.2G",
"Used": "4.0K",
"Avail": "3.2G",
"Use%": "1%",
"Mounted": "/run/user/0"
},
{
"Filesystem": "tmpfs",
"Type": "tmpfs",
"Size": "3.2G",
"Used": "4.0K",
"Avail": "3.2G",
"Use%": "1%",
"Mounted": "/run/user/1000"
}
]
}
```
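返回的 data 是 df 风格的挂载点列表,下面是一个过滤掉 tmpfs、只保留真实文件系统的处理示意草图(resp 取自上面返回示例的节选):
```
# 示意草图:过滤掉 tmpfs 挂载点,只保留真实文件系统的使用情况(数据取自上面的返回示例)
resp = {
    "status": "success",
    "data": [
        {"Filesystem": "tmpfs", "Type": "tmpfs", "Size": "3.2G", "Used": "1.3M",
         "Avail": "3.2G", "Use%": "1%", "Mounted": "/run"},
        {"Filesystem": "nfsserver:/d", "Type": "nfs4", "Size": "1.8T", "Used": "114G",
         "Avail": "1.6T", "Use%": "7%", "Mounted": "/d"},
    ],
}
real = [m for m in resp["data"] if m["Type"] != "tmpfs"]
for m in real:
    print(f'{m["Mounted"]}: 已用 {m["Used"]} / 共 {m["Size"]} ({m["Use%"]})')
```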

8
wwwroot/index.dspy Normal file
View File

@ -0,0 +1,8 @@
info('test .....{params_kw=}')
data = {
"k":"${key}$",
"s":"${secretkey}$"
}
ns = paramify(data, params_kw)
return ns