commit 3f6944adb0 — first commit

README.md (new file, 90 lines)
@@ -0,0 +1,90 @@
# pcapi

API server for the computing-power center.

## Installation notes

* First run `sh install.sh` to install/upgrade the third-party libraries.
* Pull the sqlor, ahserver, apppublic, rbac, and appbase libraries and install them in that order.
* We recommend not running `pip install -r requirement.txt` directly; it tends to cause a fair number of small problems.

## Security guarantees

* The HTTPS protocol is used for transport.
* The user password is transmitted with HTTP Basic Auth (see the sketch after this list).
* The client IP is checked against the allowed-IP set.
* Any request failing the checks above gets a 401 response.
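For reference, HTTP Basic Auth base64-encodes `user:password` into the `Authorization` header. A minimal sketch of what a helper such as the `basic_auth_headers` function used later in this README might do (the helper name `basic_auth_header` below is ours, for illustration only — not the framework's actual implementation):

```
import base64

def basic_auth_header(user, password):
    # Encode "user:password" in base64, as required by RFC 7617.
    token = base64.b64encode(f"{user}:{password}".encode()).decode()
    return {"Authorization": f"Basic {token}"}
```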
## Server-side code

### Example feature

#### Step 1: wrap a K8S feature

```
async def create_namespaced_job_v1(namespace, jobdesc):
    batch_v1 = client.BatchV1Api()
    f = awaitify(batch_v1.create_namespaced_job)
    return await f(namespace=namespace, body=jobdesc)
```

The code above wraps the following k8s call:

```
batch_v1 = client.BatchV1Api()
batch_v1.create_namespaced_job(namespace=..., body=...)
```
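Here `awaitify` comes from `appPublic.worker`; its implementation is not shown in this commit, but a minimal sketch of the usual pattern — offloading the blocking client call to a thread so it can be awaited — looks like this (an illustration, not the library's real code):

```
import asyncio
from functools import partial

def awaitify(sync_func):
    # Wrap a blocking function so it can be awaited: run it in the
    # default thread-pool executor of the running event loop.
    async def wrapper(*args, **kwargs):
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, partial(sync_func, *args, **kwargs))
    return wrapper
```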
#### Step 2: make the new feature available in dspy scripts

Put the following code at the end of the `def init_func()` function:

```
g.create_namespaced_job_v1 = create_namespaced_job_v1
```

### Adding more features

When a new feature needs to be added, follow the same steps as above.

### Data parameterization function

paramify(data, ns):

`data` is a Python string, dict, or list whose contents may define variables in the form `${name}$`; this function replaces those parameters in `data` with the corresponding values from the `ns` dict, as sketched below.

This function is available in dspy scripts.
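The actual implementation is provided by the framework; a minimal sketch of the documented behaviour (an illustration only, assuming unknown names are left untouched):

```
import re

def paramify(data, ns):
    # Replace ${name}$ placeholders with values from ns, recursing into
    # dicts and lists; names missing from ns are left as-is.
    if isinstance(data, str):
        return re.sub(r'\$\{(\w+)\}\$',
                      lambda m: str(ns.get(m.group(1), m.group(0))), data)
    if isinstance(data, dict):
        return {k: paramify(v, ns) for k, v in data.items()}
    if isinstance(data, list):
        return [paramify(v, ns) for v in data]
    return data
```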
### Backend directory convention

* All backend programs live under `wwwroot`.
* `wwwroot` contains exactly one `api` directory.
* The `api` directory holds one `v{version}` directory per API version; the version number is an integer starting from "1".
* Each API gets its own directory, named after the API, under the version directory.
* The API code lives in the `index.dspy` file inside the API's directory (see the sample layout after this list).
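For example, version 1 of an API (here a hypothetical one named `myapi`) would be laid out as:

```
wwwroot/
└── api/
    └── v1/
        └── myapi/
            └── index.dspy
```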
### Example interface script

```
info(f'test .....{params_kw=}')

data = {
    "k": "${key}$",
    "s": "${secretkey}$"
}
ns = paramify(data, params_kw)
return ns
```

## Client-side code examples

### curl version

curl --basic --user kyycloud:Kyy@123456 https://pcapi.opencomputing.cn

### dspy version

```
# The client must supply a username and password. basic_auth_headers
# generates the HTTP headers needed to authenticate the user; pass them
# via the headers argument when making the request.
headers = basic_auth_headers('kyycloud', 'Kyy@123456')
hc = HttpClient()
resp = await hc.request('https://pcapi.opencomputing.cn',
    method='GET',
    params={
        "a": "1",
        "key": "87y32iuhi453u8y56",
        "secretkey": "qqqqqcccccceeeee"
    },
    headers=headers)
info(f'{resp=}')
return resp
```

# bricks
app/__init__.py (new file, 0 lines)
app/k8sManager/__init__.py (new file, 0 lines)
app/k8sManager/k8s_utils_linuxos_ubuntu.py (new file, 566 lines)
File diff suppressed because one or more lines are too long
app/k8sManager/k8s_utils_public.py (new file, 530 lines)
@@ -0,0 +1,530 @@
import yaml
from kubernetes import client, config
from kubernetes.client.exceptions import ApiException
from appPublic.log import debug
import time
import re
import json
import ast


def format_source_labels(source_selflabel, type=None):
    """
    Format source labels (multiple key-value pairs supported).
    :param source_selflabel: source label string, e.g.:
        pod style: "key1:value1,key2:value2"
        node style: "key3=value3,key4=value4"
    :param type: label style ("pod" or "node")
    :return: formatted label dict {key: value}
    """
    if not source_selflabel or len(source_selflabel.strip()) == 0:
        return {}
    label_dict = {}
    if type == "pod":
        # Pod labels use a colon separator; pairs are comma-separated.
        for pair in source_selflabel.strip().split(","):
            if ":" in pair:
                key, value = pair.strip().split(":", 1)
                label_dict[key.strip()] = value.strip()
    elif type == "node":
        # Node labels use an equals-sign separator; pairs are comma-separated.
        for pair in source_selflabel.strip().split(","):
            if "=" in pair:
                key, value = pair.strip().split("=", 1)
                label_dict[key.strip()] = value.strip()
    else:
        return {}
    return label_dict


def format_runtime(seconds):
    if seconds < 60:
        return f"{int(seconds)}s"
    elif seconds < 3600:
        minutes = int(seconds // 60)
        return f"{minutes}m"
    elif seconds < 86400:
        hours = int(seconds // 3600)
        return f"{hours}h"
    else:
        days = int(seconds // 86400)
        return f"{days}d"


def extract_model_labels(hardware_list):
    """
    Extract the hardware model labels as a list, e.g.:
        kyy-gpu-model=RTX5090-32G,kyy-cpu-model=INTEL(R) XEON(R) PLATINUM 8582C
    :param hardware_list: list of hardware dicts with "type" and "model" keys
    """
    labels = []
    for item in hardware_list:
        if item["type"] in {"cpu", "gpu"}:
            labels.append(f"kyy-{item['type']}-model={item['model'].replace(' ', '-').replace('(', '-').replace(')', '-').replace('kyy-', '')}")
    return labels


def determine_accommodat(kubeconfig, get_resources):
    """
    Decide whether the requested product resources fit in the current cluster.
    Disk is excluded, because disk is usually managed via PV/PVC and does not
    appear in the node resource statistics.
    :param kubeconfig: kubeconfig contents
    :param get_resources: product resource dict, e.g.:
        {
            "5436-f-gdsb--ewrewrerrtwt": [
                {"type": "cpu", "model": "INTEL(R) XEON(R) PLATINUM 8582C", "amount": 0},
                {"type": "memory", "model": "Samsung DDR4 DIMMs", "amount": 0},
                {"type": "disk", "model": "DATA", "amount": 0},
                {"type": "gpu", "model": "RTX5090-32G", "amount": 0}
            ],
            "6787jhgvgjhv32412343142jvgj": [
                {"type": "cpu", "model": "INTEL(R) XEON(R) PLATINUM 8582C", "amount": 4},
                {"type": "memory", "model": "Samsung DDR4 DIMMs", "amount": 100},
                {"type": "disk", "model": "DATA", "amount": 512},
                {"type": "gpu", "model": "RTX5090-32G", "amount": 2}
            ],
        }
    :return: list of product IDs that CANNOT be accommodated by any node
    """
    init_ids = []
    try:
        all_quota = get_node_info(kubeconfig).get('rows', [])
        if not all_quota:
            debug("determine_accommodat: no node information retrieved")
            return init_ids

        products = {}
        if isinstance(get_resources, str):
            debug(f"---get_resources type: {type(get_resources)}")
            products = json.loads(get_resources)
            if isinstance(products, str):
                # Double-encoded payload: evaluate the literal safely instead
                # of using eval().
                products = ast.literal_eval(products)
                debug(f"2---products type: {type(products)}")
        else:
            # Already a dict; use it as-is.
            products = get_resources

        all_quota = [x for x in all_quota if x['node_status'] != '未就绪' and x['node_role'] != 'master']
        debug(f"\n requested resources={products},\n available resources: {all_quota}")

        # Pre-process node data, converting values to numeric types.
        processed_nodes = []
        for node in all_quota:
            # Skip unavailable nodes and control-plane nodes.
            if node['node_status'] != '已就绪' or node['node_role'] == 'master':
                continue

            # Available CPU: strip the "核" (cores) suffix and convert to float.
            cpu_str = node['available_cpu'].replace('核', '')
            available_cpu = float(cpu_str)

            # Available memory: handle the Gi unit.
            mem_str = node['available_memory']
            if mem_str.endswith('Gi'):
                available_memory = float(mem_str.replace('Gi', ''))
            else:
                # Assume any other unit is Mi and convert to Gi.
                available_memory = float(mem_str.replace('Mi', '')) / 1024

            available_gpu = node['available_gpu']

            processed_nodes.append({
                'node_name': node['node_name'],
                'node_labels': node['node_labels'],  # custom node labels
                'cpu': available_cpu,
                'memory': available_memory,
                'gpu': available_gpu
            })

        # Collect the product IDs that cannot be deployed.
        init_ids = []
        for product_id, resources in products.items():
            # Extract the product resource requirements.
            product_cpu = next((r['amount'] for r in resources if r['type'] == 'cpu'), 0)
            product_memory = next((r['amount'] for r in resources if r['type'] == 'memory'), 0)
            product_gpu = next((r['amount'] for r in resources if r['type'] == 'gpu'), 0)

            # When a Pod is created from the admin view, the CPU request may be
            # given in millicores (m); if so, convert it to cores.
            if "m" in str(product_cpu):
                product_cpu = float(product_cpu.replace("m", "")) / 1000.0

            # Likewise, the memory request may carry a Gi/Mi unit; normalize to Gi.
            if "Gi" in str(product_memory):
                product_memory = float(product_memory.replace("Gi", ""))
            elif "Mi" in str(product_memory):
                product_memory = float(product_memory.replace("Mi", "")) / 1024.0

            # Disk is not part of the node resource statistics, so it is not handled here.
            # if "Gi" in str(product_disk):
            #     product_disk = float(product_disk.replace("Gi", ""))
            # elif "Mi" in str(product_disk):
            #     product_disk = float(product_disk.replace("Mi", "")) / 1024

            # Check whether any node can satisfy this product. By convention a
            # node must not be filled completely — the request has to be strictly
            # below capacity, since other services may dynamically consume node
            # resources.
            can_deploy = False
            for node in processed_nodes:
                # Convert the product's hardware models into node-selector labels
                # and check whether this node carries all of them.
                kyy_labels = extract_model_labels(resources)
                if kyy_labels:
                    if not all(label in node['node_labels'] for label in kyy_labels):
                        debug(f"node {node['node_name']} does not satisfy the label requirements of product {product_id}: {kyy_labels}")
                        continue

                    debug(f"✅ requested labels fall within this node's selector labels; deployable: {kyy_labels}")

                debug(f"core checks: {product_cpu=} {node['cpu']=} # {float(product_memory)=} {node['memory']=} # {float(product_gpu)=} {node['gpu']=}")
                if (product_cpu < node['cpu'] and float(product_memory) < node['memory'] and float(product_gpu) <= node['gpu']):
                    can_deploy = True
                    break

            if not can_deploy:
                init_ids.append(product_id)

        debug(f"product IDs that cannot be deployed on any cluster node: {init_ids}")
        return init_ids
    except Exception:
        import traceback
        debug(f"determine_accommodat failed: {traceback.format_exc()}")
        raise RuntimeError(f"determine_accommodat failed: {traceback.format_exc()}")


def get_pod_info(kubeconfig):
    try:
        # config.load_kube_config()
        kubeconfig = yaml.safe_load(kubeconfig)
        config.load_kube_config_from_dict(kubeconfig)
        v1 = client.CoreV1Api()
        api_client = client.ApiClient()

        namespaces = v1.list_namespace(timeout_seconds=1).items
        non_system_namespaces = [ns.metadata.name for ns in namespaces if
                                 not ns.metadata.name.startswith(('kube-', 'default', 'local', 'ingress-'))]

        rows = []
        for namespace in non_system_namespaces:
            pods = v1.list_namespaced_pod(namespace).items
            pod_metrics_path = f"/apis/metrics.k8s.io/v1beta1/namespaces/{namespace}/pods"
            pod_metrics_response = api_client.call_api(
                pod_metrics_path, 'GET', auth_settings=['BearerToken'], response_type='object')[0]
            pod_metrics = {pod['metadata']['name']: pod.get("containers", [{}])[0].get('usage', {})
                           for pod in pod_metrics_response.get('items', [])}

            for pod in pods:
                pod_name = pod.metadata.name

                # Total number of containers.
                total_containers = len(pod.spec.containers)
                # Number of ready containers.
                ready_count = 0
                if pod.status.container_statuses:
                    ready_count = sum(1 for status in pod.status.container_statuses if status.ready)
                # Ready ratio and readiness status.
                ready_ratio = ready_count / total_containers if total_containers > 0 else 0
                ready_status = "已就绪" if ready_ratio >= 1 else "未就绪"

                # The set-based one-liner below was abandoned because the set
                # arithmetic misbehaves in edge cases:
                # ready_status = "已就绪" if ({ready_count}/{len(pod.spec.containers)}) >= 1 else "未就绪"

                readiness_conditions = [{"type": cond.type, "status": cond.status}
                                        for cond in pod.status.conditions if cond.type == "Ready"]
                phase = pod.status.phase
                restart_count = sum(cs.restart_count for cs in pod.status.container_statuses) if pod.status.container_statuses else 0
                running_time = time.time() - pod.metadata.creation_timestamp.timestamp()
                pod_age = format_runtime(running_time)
                pod_ip = pod.status.pod_ip if pod.status.pod_ip else "Unknown"
                node_name = pod.spec.node_name if pod.spec.node_name else "Pod未被调度到节点"
                nominated_node = pod.status.nominated_node_name if pod.status.nominated_node_name else "无"

                if phase == "Pending":
                    pod_ip = "Pending状态,未分配 IP"
                    node_name = "Pending状态,未分配节点"
                    nominated_node = "Pending状态,未分配节点"

                # Extract the container resource limits.
                cpu_limit = "未设置"
                memory_limit = "未设置"
                gpu_limit = "未设置"

                if pod.spec.containers:
                    container = pod.spec.containers[0]  # assume only the first container matters
                    if container.resources and container.resources.limits:
                        limits = container.resources.limits
                        cpu_limit = limits.get("cpu", "未设置")  # assume the CPU limit is in cores
                        # Special case: a CPU limit in millicores (m) is converted to cores.
                        if isinstance(cpu_limit, str) and cpu_limit.endswith("m"):
                            debug(f'millicore cpu_limit {cpu_limit} converted to {float(int(cpu_limit.replace("m", "")) / 1000)} cores')
                            cpu_limit = f'{float(int(cpu_limit.replace("m", "")) / 1000)}'
                        memory_limit = limits.get("memory", "未设置")
                        gpu_limit = limits.get("nvidia.com/gpu", "未设置")  # NVIDIA GPUs only

                # Fetch the metrics data (existing logic unchanged).
                cpu_usage = pod_metrics.get(pod_name, {}).get('cpu', 'undefined')
                if cpu_usage and isinstance(cpu_usage, str):
                    cpu_usage = int(cpu_usage.replace("n", "")) if cpu_usage.endswith("n") else 0
                    cpu_usage = f'{(cpu_usage / 1000000 / 1000):.3f}核'  # nanocores -> cores
                memory_usage = pod_metrics.get(pod_name, {}).get('memory', 'undefined')
                if memory_usage and isinstance(memory_usage, str):
                    memory_usage = int(memory_usage.replace("Ki", "")) if memory_usage.endswith("Ki") else 0
                    memory_usage = f"{(memory_usage / 1024 / 1024):.3f}Gi"  # Ki -> Gi

                if phase in ["Pending", "Succeeded", "Failed"]:
                    cpu_usage = "Pod未运行,无资源使用数据"
                    memory_usage = "Pod未运行,无资源使用数据"

                # New GPU usage field (placeholder for now).
                gpu_usage = "0%"  # replace with a real value if DCGM / Prometheus is available
                pod_info = {
                    "pod_namespace": namespace,
                    "pod_name": pod_name,
                    "pod_ready": ready_status,
                    "pod_running": phase,
                    "pod_restart": str(restart_count),
                    "pod_age": pod_age,
                    "pod_ip": pod_ip,
                    "pod_node": node_name,
                    "pod_nominated_node": nominated_node,
                    "pod_cpurate": cpu_usage,
                    "pod_memrate": memory_usage,
                    # new fields
                    "pod_gpu": gpu_limit,
                    "pod_cpu_limit": cpu_limit + "核" if cpu_limit != "未设置" else "未设置",
                    "pod_memory_limit": memory_limit,
                    "pod_gpu_limit": gpu_limit,
                }
                rows.append(pod_info)

        result = {
            "total": len(rows),
            "rows": rows
        }
        return result
    except Exception:
        import traceback
        debug(f"failed to fetch Pod info: {traceback.format_exc()}")
        raise RuntimeError(traceback.format_exc())


def get_node_info(kubeconfig):
    # Load the configuration.
    try:
        kubeconfig = yaml.safe_load(kubeconfig)
        config.load_kube_config_from_dict(kubeconfig)
        v1 = client.CoreV1Api()
        api_client = client.ApiClient()

        # Fetch node metrics and the Pod list.
        node_metrics_path = "/apis/metrics.k8s.io/v1beta1/nodes"
        node_metrics_response = api_client.call_api(
            node_metrics_path, 'GET', auth_settings=['BearerToken'], response_type='object')[0]
        node_metrics = {node['metadata']['name']: node.get('usage', {})
                        for node in node_metrics_response.get('items', [])}

        # Fetch all Pods and their resource requests.
        pods = v1.list_pod_for_all_namespaces(timeout_seconds=1).items
        node_pod_resources = {}  # per-node Pod resource requests

        for pod in pods:
            if pod.spec.node_name and pod.status.phase in ["Running", "Pending"]:
                node_name = pod.spec.node_name
                if node_name not in node_pod_resources:
                    node_pod_resources[node_name] = {
                        "cpu": 0,
                        "memory": 0,
                        "gpu": 0
                    }

                # Accumulate the container resource requests.
                for container in pod.spec.containers:
                    if container.resources and container.resources.requests:
                        # CPU (converted to millicores)
                        cpu_request = container.resources.requests.get("cpu", "0m")
                        cpu_millis = int(float(cpu_request.rstrip("m"))) if "m" in cpu_request else int(float(cpu_request) * 1000)
                        node_pod_resources[node_name]["cpu"] += cpu_millis

                        # Memory (converted to bytes)
                        memory_request = container.resources.requests.get("memory", "0")
                        memory_bytes = int(float(memory_request.rstrip("KiMiGi")))
                        if "Ki" in memory_request:
                            memory_bytes *= 1024
                        elif "Mi" in memory_request:
                            memory_bytes *= 1024 * 1024
                        elif "Gi" in memory_request:
                            memory_bytes *= 1024 * 1024 * 1024
                        node_pod_resources[node_name]["memory"] += memory_bytes

                        # GPU
                        gpu_request = container.resources.requests.get("nvidia.com/gpu", "0")
                        node_pod_resources[node_name]["gpu"] += int(gpu_request)

        # Fetch the node list and compute resource usage.
        nodes = v1.list_node().items
        rows = []

        for node in nodes:
            node_name = node.metadata.name
            internal_ip = next((address.address for address in node.status.addresses
                                if address.type == "InternalIP"), "未分配")
            external_ip = next((address.address for address in node.status.addresses
                                if address.type == "ExternalIP"), "未分配")
            status = node.status.conditions[-1].status if node.status.conditions else "Unknown"
            status = "已就绪" if status == "True" else "未就绪"

            # Node roles
            roles = []
            role_labels = [
                "node-role.kubernetes.io/control-plane",
                "node-role.kubernetes.io/master",
                "node-role.kubernetes.io/worker"
            ]
            for label in role_labels:
                if label in node.metadata.labels:
                    roles.append(label.split("/")[-1])
            roles_str = "master" if roles else "worker"

            # Node uptime
            running_time = time.time() - node.metadata.creation_timestamp.timestamp()
            node_age = format_runtime(running_time)

            # Node info
            k8s_version = node.status.node_info.kubelet_version
            os_image = node.status.node_info.os_image
            kernel_version = node.status.node_info.kernel_version
            container_runtime = node.status.node_info.container_runtime_version

            # Custom labels
            labels = node.metadata.labels
            kyy_labels = [f"{k}={v}" for k, v in labels.items() if k.startswith('kyy-')]

            # Live resource usage
            cpu_usage = node_metrics.get(node_name, {}).get('cpu', 'undefined')
            if cpu_usage and isinstance(cpu_usage, str):
                cpu_usage = int(cpu_usage.replace("n", ""))
                cpu_usage = f'{(cpu_usage / 1000000 / 1000):.3f}核'  # nanocores -> cores

            memory_usage = node_metrics.get(node_name, {}).get('memory', 'undefined')
            if memory_usage and isinstance(memory_usage, str):
                memory_usage = int(memory_usage.replace("Ki", ""))
                memory_usage = f"{(memory_usage / 1024 / 1024):.3f}Gi"  # Ki -> Gi

            # Total node resources. Allocatable memory is typically reported in
            # Ki; parse_resource_value normalizes it to Gi.
            total_cpu = float(node.status.allocatable.get("cpu", "0"))
            total_memory = parse_resource_value(node.status.allocatable.get("memory", "0"))
            total_gpu = int(node.status.allocatable.get("nvidia.com/gpu", "0"))

            # Allocated resources
            allocated_cpu = node_pod_resources.get(node_name, {}).get("cpu", 0) / 1000.0  # millicores -> cores
            allocated_memory = node_pod_resources.get(node_name, {}).get("memory", 0) / (1024 ** 3)  # bytes -> Gi
            allocated_gpu = node_pod_resources.get(node_name, {}).get("gpu", 0)

            # Available resources
            available_cpu = total_cpu - allocated_cpu
            available_memory = total_memory - allocated_memory
            available_gpu = total_gpu - allocated_gpu

            node_info = {
                "node_name": node_name,
                "node_status": status,
                "node_role": roles_str,
                "node_age": node_age,
                "node_version": k8s_version,
                "node_internalip": internal_ip,
                "node_externalip": external_ip,
                "node_osversion": os_image,
                "node_kernelversion": kernel_version,
                "node_containeruntime": container_runtime,
                "node_labels": kyy_labels,
                "node_cpurate": f"{(allocated_cpu / total_cpu * 100):.1f}%" if total_cpu > 0 else "0%",  # cpu_usage,
                "node_memrate": f"{(allocated_memory / total_memory * 100):.1f}%" if total_memory > 0 else "0%",  # memory_usage,
                "node_gpu": f"{(allocated_gpu / total_gpu * 100):.1f}%" if total_gpu > 0 else "0%",
                # additional resource fields
                # "node_total_cpu": f"{total_cpu:.2f}核",
                # "allocated_cpu": f"{allocated_cpu:.2f}核",
                "available_cpu": f"{available_cpu:.2f}核",
                # "cpu_rate": f"{(allocated_cpu / total_cpu * 100):.1f}%" if total_cpu > 0 else "0%",

                # "node_total_memory": f"{total_memory:.2f}Gi",
                # "allocated_memory": f"{allocated_memory:.2f}Gi",
                "available_memory": f"{available_memory:.2f}Gi",
                # "memory_rate": f"{(allocated_memory / total_memory * 100):.1f}%" if total_memory > 0 else "0%",

                # "node_total_gpu": total_gpu,
                # "allocated_gpu": allocated_gpu,
                "available_gpu": available_gpu,
                # "gpu_rate": f"{(allocated_gpu / total_gpu * 100):.1f}%" if total_gpu > 0 else "0%"
            }
            rows.append(node_info)

        result = {
            "total": len(rows),
            "rows": rows
        }
        debug(f"=== node_info={result}")
        return result
    except Exception:
        import traceback
        e = traceback.format_exc()
        debug(f"failed to fetch node info: {e}")
        raise RuntimeError(e)


# Helper: parse a resource value.
def parse_resource_value(value: str) -> float:
    """Parse a Kubernetes resource value (e.g. "1.5", "500m", "2Gi") into a
    uniform unit: cores for CPU, Gi for memory."""
    if not value:
        return 0.0

    # CPU (cores or millicores)
    if value.endswith('m'):
        return float(value[:-1]) / 1000.0  # millicores -> cores
    elif re.match(r'^\d+(\.\d+)?$', value):
        return float(value)  # already cores

    # Memory (Ki, Mi, Gi, Ti), normalized to Gi.
    elif value.endswith('Ki'):
        return float(value[:-2]) / (1024 ** 2)  # Ki -> Gi
    elif value.endswith('Mi'):
        return float(value[:-2]) / (1024 ** 1)  # Mi -> Gi
    elif value.endswith('Gi'):
        return float(value[:-2])
    elif value.endswith('Ti'):
        return float(value[:-2]) * 1024  # Ti -> Gi

    return float(value)  # fall back to the raw unit
app/k8sManager/k8s_utils_relationaldb_mysql.py (new file, 561 lines)
File diff suppressed because one or more lines are too long
app/k8sManager/multiple_clusters.py (new file, 573 lines)
@@ -0,0 +1,573 @@
import json
import yaml
import os
import hashlib
import sqlite3
from pathlib import Path
from datetime import datetime
from os.path import expanduser
from kubernetes import client, config
from kubernetes.client import ApiException
from . import k8s_utils_linuxos_ubuntu, k8s_utils_relationaldb_mysql, parse_k8s_params
from . import ssh_utils, k8s_utils_public
from appPublic.log import debug
import traceback


def delete_cluster_node(params):
    """
    Delete a cluster node.
    --namespace / -n: the namespace the node belongs to. Nodes are cluster-level
        resources and do not belong to any namespace, so this flag is normally
        not used when deleting a node.
    --force: force-delete a node that is unreachable or unresponsive:
        kubectl delete node <node-name> --force
    --grace-period: grace period in seconds before the node is forcibly
        terminated. Default 30; 0 means delete immediately. Usually combined
        with --force:
        kubectl delete node <node-name> --force --grace-period=0
    Before deleting a node, mark it unschedulable (cordon) and safely migrate
    its Pods to other nodes (drain):
        kubectl cordon <node-name>
        kubectl drain <node-name> --ignore-daemonsets --delete-emptydir-data --ignore-not-found
    --ignore-daemonsets: skip Pods created by DaemonSets; a DaemonSet keeps one
        replica on every node, so these Pods do not need migrating.
    --delete-emptydir-data: delete data in EmptyDir volumes; EmptyDir is
        ephemeral storage, so its data is lost when the node is removed.
    --ignore-not-found: do not fail if the named node does not exist.
    """
    return "delete_cluster_node ok"


def node_state_switch(params):
    """
    Resume a node:
        kubectl uncordon marks the node schedulable again, so the scheduler will
        reconsider it for new Pods:
        kubectl uncordon worker-node-1
    Pause a node:
        kubectl cordon marks the node unschedulable, so the scheduler will not
        place new Pods on it:
        kubectl cordon worker-node-1
    (Optional) drain the Pods on the node:
        kubectl drain <node-name> --ignore-daemonsets --delete-emptydir-data
    """
    return "node_state_switch ok"


def yaml_apply_delete(params):
    """
    1. Cascade-initialize resource instances from the parameters passed by cpcc;
    2. Cascade-update resource instances from the parameters passed by cpcc;
    3. Cascade-delete resource instances from the parameters passed by cpcc.
    """
    # To better support multiple resource-instance types (operating systems,
    # relational / non-relational databases, etc.), each type gets its own
    # branch for easier maintenance.
    instance_type = params.get("instance_type")
    if instance_type == "RelationalDB":
        k8s_utils_relationaldb_mysql.handle_k8s_operations(params)
    # if instance_type == "RelationalDB_PostgreSQL":
    #     k8s_utils_relationaldb_mysql.handle_k8s_operations(params)
    elif instance_type == "LinuxOS":
        k8s_utils_linuxos_ubuntu.handle_k8s_operations(params)


def node_label_opt(params):
    """
    To set label app on node worker-node-1:
        kubectl label nodes worker-node-1 app=app
    (an equals sign between key and value sets the label).

    To remove label app from node worker-node-1:
        kubectl label nodes worker-node-1 app-
    (a trailing dash after the key removes the label).

    After a label is set/removed, the scheduler takes it into account for Pod
    scheduling, so the label can be used to select specific nodes.
    Setting/removing a label does not affect Pods already running on the node.
    """
    host = params.get("host")
    port = int(params.get("port"))
    username = params.get("user")
    password = params.get("password")
    worker_node = params.get("worker_node")
    label = params.get("label")
    opt = params.get("opt")
    if opt == "label":
        get_cluster_node_cmd = [f"kubectl label nodes {worker_node} {label} --overwrite"]
        debug(f'label command: {get_cluster_node_cmd}')
        if username != "root":
            results = ssh_utils.ssh_execute_command_noroot(host, port, username, password,
                                                           get_cluster_node_cmd, sudo_timeout=10)  # labeling may take a moment
        else:
            results = ssh_utils.ssh_execute_command(host, port, username, password, get_cluster_node_cmd)
        overwrite_info = results[0][0].strip()
        if "not labeled" in overwrite_info:
            raise RuntimeError(f"{worker_node}: failed to set label {label}; check the cluster node state or whether the label is already set")
        else:
            return f"{worker_node}: label {label} set successfully!"
    elif opt == "unlabel":
        get_cluster_node_cmd = ["kubectl label nodes %s %s-" % (worker_node, label.split('=')[0])]
        debug(f'unlabel command: {get_cluster_node_cmd}')
        if username != "root":
            results = ssh_utils.ssh_execute_command_noroot(host, port, username, password,
                                                           get_cluster_node_cmd, sudo_timeout=10)  # unlabeling may take a moment
        else:
            results = ssh_utils.ssh_execute_command(host, port, username, password, get_cluster_node_cmd)
        # debug(f'unlabel result: {results}')
        overwrite_info = results[0][0].strip()
        if "unlabeled" in overwrite_info or overwrite_info == "":
            return f"{worker_node}: label {label} removed successfully!"
        else:
            raise RuntimeError(f"{worker_node}: failed to remove label {label}; check the cluster node state or whether the label was ever set")


def unset_node_label(params):
    """
    To remove label app from node worker-node-1:
        kubectl label nodes worker-node-1 app-
    (a trailing dash after the key removes the label).
    After removal, the node no longer carries the label and the scheduler stops
    considering it for Pod scheduling.
    Removing a label does not affect Pods already running on the node.
    """
    # Unfinished stub: the parameters are parsed but no action is taken yet;
    # the unlabel path currently lives in node_label_opt(opt="unlabel").
    host = params.get("host")
    port = int(params.get("port"))
    username = params.get("user")
    password = params.get("password")
    worker_node = params.get("worker_node")
    label = params.get("label")


def get_cluster_nodes_by_server(params):
    host = params.get("host")
    port = int(params.get("port"))
    username = params.get("user")
    password = params.get("password")
    get_cluster_node_cmd = ["kubectl get nodes -o wide --show-labels"]
    if username != "root":
        results = ssh_utils.ssh_execute_command_noroot(host, port, username, password,
                                                       get_cluster_node_cmd, sudo_timeout=10)
    else:
        results = ssh_utils.ssh_execute_command(host, port, username, password, get_cluster_node_cmd)
    parse_k8s_nodes_result = results[0][0].strip()
    parse_k8s_nodes_result = parse_k8s_params.parse_k8s_nodes(parse_k8s_nodes_result)
    # debug(f'cluster {host=} node info {results=} => parsed:\n{parse_k8s_nodes_result=}')
    return parse_k8s_nodes_result


def get_cluster_pods_by_kubeconfig(params):
    """
    Fetch details of all resource instances (Pods) in the cluster, using the
    kubeconfig supplied by the caller.
    """
    kubeconfig = params.get("kubeconfig")
    return k8s_utils_public.get_pod_info(kubeconfig)


def determine_accommodat_by_kubeconfig(params):
    """
    Decide which component combinations can be deployed in the cluster, using
    the kubeconfig supplied by the caller.
    Returns the list of product IDs that cannot be accommodated.
    """
    # debug(f'=====determine_accommodat_by_kubeconfig params: {params}')
    kubeconfig = params.get("kubeconfig")
    resources = params.get("resources", {})
    # debug(f'=====kubeconfig: {kubeconfig}, resources: {resources}')
    return k8s_utils_public.determine_accommodat(kubeconfig, resources)


def get_cluster_nodes_by_kubeconfig(params):
    """
    Fetch details of all cluster nodes, using the kubeconfig supplied by the caller.
    """
    kubeconfig = params.get("kubeconfig")
    return k8s_utils_public.get_node_info(kubeconfig)


def get_cluster_pods_by_server(params):
    host = params.get("host")
    port = int(params.get("port"))
    username = params.get("user")
    password = params.get("password")
    # get_cluster_node_cmd = ["kubectl get pods --all-namespaces -o wide"]
    get_cluster_pod_cmd = ["kubectl get pods --all-namespaces -o wide | grep -Ev 'kube-flannel|kube-system'"]
    if username != "root":
        results = ssh_utils.ssh_execute_command_noroot(host, port, username, password,
                                                       get_cluster_pod_cmd, sudo_timeout=10)
    else:
        results = ssh_utils.ssh_execute_command(host, port, username, password, get_cluster_pod_cmd)
    parse_k8s_pods_result = results[0][0].strip()
    parse_k8s_pods_result = parse_k8s_params.parse_k8s_pods(parse_k8s_pods_result)
    # debug(f'cluster {host=} Pod info {results=} => parsed:\n{parse_k8s_pods_result=}')
    return parse_k8s_pods_result


def new_cluster_install(params):
    # Main remote k8s-control logic; to be extended later.
    """
    Receives the k8s installation parameters passed from the cpcc side and
    installs cluster nodes on intranet machines via remote ssh calls.
    Both control-plane and worker nodes can be installed.
    Example parameters:
        {'cluster_type': '0', 'host': '192.168.0.3', 'port': '22', 'user': 'ysh', 'password': 'Kyy@123456'}
    """
    debug(f'=====new_cluster_install params: {params}')
    host = params.get("host")
    port = int(params.get("port"))
    username = params.get("user")
    password = params.get("password")
    role = params.get("role")
    target_file_path = "/opt/k8s_install.sh"
    local_file_path = "script/k8s_install.sh"
    scp_map = {
        local_file_path: target_file_path,
        "files/kube-flannel.yml": "/opt/kube-flannel.yml",
        "files/components.yaml": "/opt/components.yaml",
        "files/ingress-nginx-controller.yaml": "/opt/ingress-nginx-controller.yaml",
        "files/storage_class.yaml": "/opt/storage_class.yaml",
        # "files/nfs-provisioner-deploy.yaml": "/opt/nfs-provisioner-deploy.yaml",
        "files/nfs-rbac.yaml": "/opt/nfs-rbac.yaml",
        "files/nvidia-device-plugin.yml": "/opt/nvidia-device-plugin.yml",
        "script/k8s_uninstall.sh": "/opt/k8s_uninstall.sh",
        "script/import_images.sh": "/opt/import_images.sh",
    }
    # The NFS settings only apply when installing the control-plane node.
    nfs_server_ip = host if role == "master" else str()
    nfs_share_path = "/k8sdata" if role == "master" else str()
    install_clusterrole_command = ["chmod 755 %s" % target_file_path,
                                   "%s %s %s %s" % (target_file_path, role, nfs_server_ip, nfs_share_path)]
    debug(f'{install_clusterrole_command=}')
    try:
        if username == "root":
            # Root user: run the install script directly.
            debug(f'starting cluster-node install as root, user: {username}, role: {role}, host: {host}, port: {port}')
            ssh_utils.ssh_execute_command(host, port, username, password,
                                          install_clusterrole_command, real_time_log=True,
                                          scp_map=scp_map)
        else:
            # Non-root user: files must be staged first and commands run via sudo.
            debug(f'starting cluster-node install as non-root, user: {username}, role: {role}, host: {host}, port: {port}')
            ssh_utils.ssh_execute_command_noroot(host, port, username, password,
                                                 install_clusterrole_command, real_time_log=True,
                                                 scp_map=scp_map,
                                                 sudo_timeout=500)  # long timeout to accommodate the K8s install
    except Exception:
        # debug(f"cluster node installation failed: {traceback.format_exc()}")
        raise RuntimeError(traceback.format_exc())
    results = "%s => %s node installed successfully" % (host, role)
    if role == "master":
        # Installing a control-plane node takes three steps:
        # 1. run the install command;
        # 2. obtain the join credential for worker nodes;
        # 3. return the join credential to cpcc for storage (pcapi is stateless).
        clusterauth_command = ['kubeadm token create --print-join-command --ttl 0']
        if username != "root":
            join_idp = ssh_utils.ssh_execute_command_noroot(host, port, username, password, clusterauth_command,
                                                            real_time_log=True, sudo_timeout=60)  # token creation should finish quickly
        else:
            join_idp = ssh_utils.ssh_execute_command(host, port, username, password, clusterauth_command, real_time_log=True)
        join_idp = join_idp[0][0].strip()
        debug(f'cluster join credential: {join_idp=}')
        kubeconfig_context_command = ['cat /root/.kube/config']
        if username != "root":
            kubeconfig = ssh_utils.ssh_execute_command_noroot(host, port, username, password,
                                                              kubeconfig_context_command, real_time_log=True,
                                                              sudo_timeout=60)  # reading kubeconfig should finish quickly
        else:
            kubeconfig = ssh_utils.ssh_execute_command(host, port, username, password, kubeconfig_context_command, real_time_log=True)
        kubeconfig = kubeconfig[0][0].strip()
        debug(f'cluster context: {kubeconfig=}')
        results = join_idp + "###" + kubeconfig
    if role == "worker":
        # Installing a worker node takes two steps:
        # 1. run the install command;
        # 2. join the cluster using the join command passed in.
        debug('worker node joining the cluster')

        join_command = params.get("join_command")
        if username != "root":
            ssh_utils.ssh_execute_command_noroot(host, port, username, password, [join_command],
                                                 real_time_log=True, sudo_timeout=120)  # joining may take a while
        else:
            ssh_utils.ssh_execute_command(host, port, username, password, [join_command], real_time_log=True)

    return results


def get_multiple_cluster_pod():
    """
    Fetch Pod information for every cluster in the kubeconfig (JSON format).

    What it does:
    1. iterate over all contexts (clusters) in the kubeconfig;
    2. for each cluster, fetch the Pods from all namespaces;
    3. return the formatted result.

    Return value example:
        {
            "cluster1": [
                {"ip": "10.0.0.1", "namespace": "default", "name": "pod1"},
                ...
            ],
            "cluster2": [...]
        }
    """
    # List all cluster contexts (ignoring which one is currently active).
    contexts, _ = config.list_kube_config_contexts()
    if not contexts:
        print("no cluster contexts found")
        return

    all_clusters_pods = {}  # Pod info for all clusters

    for context in contexts:
        cluster_name = context["name"]
        try:
            # Create a per-cluster API client.
            api_client = config.new_client_from_config(context=cluster_name)
            v1 = client.CoreV1Api(api_client)

            # Collect the Pod info of the current cluster.
            pods = []
            for pod in v1.list_pod_for_all_namespaces().items:
                pods.append({
                    "ip": pod.status.pod_ip,
                    "namespace": pod.metadata.namespace,
                    "name": pod.metadata.name
                })

            all_clusters_pods[cluster_name] = pods

        except Exception as e:
            print(f"cluster {cluster_name} is unreachable: {str(e)}")

    return all_clusters_pods


def get_multiple_cluster():
    """
    Fetch full information for all clusters.

    For every context in the kubeconfig file this function:
    1. extracts static information from the kubeconfig, such as the API server
       address and the context/user names;
    2. queries the Kubernetes API for dynamic information: node count,
       not-ready node count, and the Kubernetes version;
    3. records any error raised while parsing the config or calling the API.
    (Extraction of client certificates/keys/tokens is present but currently
    commented out.)

    Return format example:
        {
            "cluster1": {
                "context_name": "ctx1",
                "server_url": "https://1.1.1.1:6443",
                "nodes_count": 3,
                "notready_nodes": 0,
                "k8s_version": "v1.28.3",
                "error": None,
                "user_info": {"name": "...", "client_certificate": "not_support",
                              "client_key": "not_support", "token": "not_support"}
            }
        }
    """
    try:
        config.load_kube_config()
        contexts, _ = config.list_kube_config_contexts()

        if not contexts:
            return json.dumps({"error": "no cluster context information found"}, indent=4)

        # Read the kubeconfig file directly for the raw configuration.
        kubeconfig_path = expanduser("~/.kube/config")
        with open(kubeconfig_path, 'r') as f:
            config_dict = yaml.safe_load(f)

        clusters_config = config_dict.get('clusters', [])
        users_config = config_dict.get('users', [])

        all_clusters_info = {}
        for context in contexts:
            cluster_name = context['context']['cluster']
            user_name = context['context'].get('user')
            context_name = context['name']  # context name

            cluster_info = {
                'nodes_count': 0,
                'notready_nodes': 0,
                'k8s_version': '',
                'error': None,
                'server_url': '',
                'context_name': context_name,
                'user_info': {
                    'name': '',
                    'client_certificate': 'not_support',
                    'client_key': 'not_support',
                    'token': 'not_support'
                }
            }

            # Extract the server URL and user info.
            cluster_config = next(
                (c for c in clusters_config if c['name'] == cluster_name),
                {}
            )
            cluster_info['server_url'] = cluster_config.get('cluster', {}).get('server', '')

            user_config = next(
                (u for u in users_config if u['name'] == user_name),
                {}
            )
            user_data = user_config.get('user', {})

            # Fill in the user info.
            cluster_info['user_info']['name'] = user_config.get('name', '')
            # cluster_info['user_info']['client_certificate'] = user_data.get('client-certificate-data', '')
            # cluster_info['user_info']['client_key'] = user_data.get('client-key-data', '')
            # cluster_info['user_info']['token'] = user_data.get('token', '')

            try:
                api_client = config.new_client_from_config(context=context['name'])
                v1 = client.CoreV1Api(api_client)
                version_api = client.VersionApi(api_client)

                nodes = v1.list_node().items
                cluster_info['nodes_count'] = len(nodes)

                notready_nodes = 0
                for node in nodes:
                    ready_condition = next(
                        (cond for cond in node.status.conditions
                         if cond.type == "Ready" and cond.status == "True"),
                        None
                    )
                    if not ready_condition:
                        notready_nodes += 1

                cluster_info['notready_nodes'] = notready_nodes

                version = version_api.get_code()
                cluster_info['k8s_version'] = version.git_version

            except ApiException as e:
                cluster_info['error'] = f"API error ({e.status}): {e.reason}"
            except Exception as e:
                cluster_info['error'] = f"connection failed: {str(e)}"

            all_clusters_info[cluster_name] = cluster_info  # still keyed by cluster_name

        # return json.dumps(all_clusters_info, indent=4, ensure_ascii=False)
        return all_clusters_info
    except Exception as e:
        return json.dumps({
            'error': f"system error: {str(e)}"
        }, indent=4)


def process_kubeconfigs():
    """
    Scan the kubestage folder under the app directory for kubeconfig-format
    files, compute each file's uppercase MD5, rename the file to that MD5, and
    store it in the savekubes directory using the layout
    [md5[0]/md5[1]/md5[2]/md5].
    On an MD5 collision, log the conflicting files and skip the file.
    Record each cluster kubeconfig's storage path in savekubes, plus related
    information, in the database table.
    """
    # Define the paths.
    current_dir = Path.cwd()
    app_dir = current_dir / "app"
    source_dir = app_dir / "kubestage"
    target_dir = app_dir / "savekubes"
    db_path = app_dir / "mk8s.db"  # SQLite database path

    # Make sure the target directory exists.
    target_dir.mkdir(parents=True, exist_ok=True)

    # Connect to the SQLite database (the file is created automatically).
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()

    # Create the table if it does not exist.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS mk8s (
            md5_hash TEXT PRIMARY KEY,
            server_url TEXT NOT NULL,
            now_path TEXT NOT NULL,
            original_filename TEXT NOT NULL,
            timestamp TEXT NOT NULL
        )
    ''')
    conn.commit()

    # Track the MD5 values already processed.
    md5_map = {}

    for file_path in source_dir.glob("*"):
        if not file_path.is_file():
            continue

        try:
            # Read and parse the YAML.
            with open(file_path, "rb") as f:
                file_content = f.read()
            config_data = yaml.safe_load(file_content)

            # Validate the basic structure.
            if not all(key in config_data for key in ["apiVersion", "clusters", "contexts"]):
                raise ValueError("missing required fields: apiVersion, clusters or contexts")

            if not isinstance(config_data["clusters"], list) or not isinstance(config_data["contexts"], list):
                raise ValueError("clusters or contexts must be lists")

            # Extract the server URL.
            server_url = None
            for cluster in config_data["clusters"]:
                if "cluster" in cluster and "server" in cluster["cluster"]:
                    server_url = cluster["cluster"]["server"]
                    break
            if not server_url:
                raise ValueError("no valid server URL found")

            # Compute the MD5.
            md5_hash = hashlib.md5(file_content).hexdigest().upper()

            # Check for MD5 collisions.
            if md5_hash in md5_map:
                print(f"MD5 collision: files {file_path} and {md5_map[md5_hash]} share MD5 {md5_hash}; skipping.")
                continue

            # Record the MD5 mapping.
            md5_map[md5_hash] = str(file_path)

            # Build the target path and move the file.
            sub_dir = target_dir / md5_hash[0] / md5_hash[1] / md5_hash[2]
            target_file_path = sub_dir / md5_hash
            sub_dir.mkdir(parents=True, exist_ok=True)
            os.rename(file_path, target_file_path)
            # print(f"processed: {file_path} -> {target_file_path}")
            print("cluster added! kubeconfig stored at: %s" % target_file_path)

            # Insert into the SQLite database.
            timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
            print(md5_hash, server_url, target_file_path, file_path.name, timestamp)
            cursor.execute(
                "INSERT INTO mk8s (md5_hash, server_url, now_path, original_filename, timestamp) "
                "VALUES (?, ?, ?, ?, ?)",
                (md5_hash, server_url, str(target_file_path), file_path.name, timestamp)
            )
            conn.commit()

        # except yaml.YAMLError as e:
        #     error_mark = getattr(e, "problem_mark", None)
        #     if error_mark:
        #         error_message = f"YAML syntax error at line {error_mark.line + 1}, column {error_mark.column + 1}: {e.problem}"
        #     else:
        #         error_message = f"YAML parsing failed: {str(e)}"
        #     print(f"file {file_path} is not valid kubeconfig ({error_message}); skipping.")
        # except ValueError as e:
        #     print(f"file {file_path} is not valid kubeconfig ({str(e)}); skipping.")
        # except Exception as e:
        #     print(f"unexpected error while processing {file_path}: {str(e)}; skipping.")
        except Exception:
            traceback.print_exc()

    cursor.execute("SELECT * FROM mk8s;")
    rows = cursor.fetchall()
    print(rows)

    # Close the database connection.
    conn.close()


if __name__ == "__main__":
    # get_multiple_cluster()
    # ret = get_cluster_nodes_by_server("192_168_0_3-6443")
    # print(ret)
    # process_kubeconfigs()
    pass
app/k8sManager/parse_k8s_params.py (new file, 92 lines)
@@ -0,0 +1,92 @@
import re
import json


def parse_k8s_pods(input_text):
    header_mapping = {
        "NAMESPACE": "pod_namespace",
        "NAME": "pod_name",
        "READY": "pod_ready",
        "STATUS": "pod_running",
        "RESTARTS": "pod_restart",
        "AGE": "pod_age",
        "IP": "pod_ip",
        "NODE": "pod_node",
        "NOMINATED NODE": "pod_nominated_node",
        "READINESS GATES": "pod_readiness_gates"  # check the column-name mapping
    }

    lines = [line.strip() for line in input_text.strip().split('\n')]
    if not lines:
        return {"total": 0, "rows": []}

    raw_headers = re.split(r'\s{2,}', lines[0])
    rows = []
    for line in lines[1:]:
        values = re.split(r'\s{2,}', line.strip())
        if len(values) == len(raw_headers):
            mapped = {}
            for i, raw_h in enumerate(raw_headers):
                if raw_h in header_mapping:
                    mapped[header_mapping[raw_h]] = values[i]
            rows.append(mapped)

    return {
        "total": len(rows),
        "rows": rows
    }


def parse_k8s_nodes(input_text):
    # Header mapping (raw header -> target field name).
    header_mapping = {
        "NAME": "node_name",
        "STATUS": "node_status",
        "ROLES": "node_role",
        "AGE": "node_age",
        "VERSION": "node_version",
        "INTERNAL-IP": "node_internalip",
        "EXTERNAL-IP": "node_externalip",
        "OS-IMAGE": "node_osversion",
        "KERNEL-VERSION": "node_kernelversion",
        "CONTAINER-RUNTIME": "node_containeruntime",
        "LABELS": "node_labels",
    }

    lines = [line.strip() for line in input_text.strip().split('\n')]

    # Process the header row (apply the mapping).
    raw_headers = re.split(r'\s{2,}', lines[0])
    headers = [header_mapping[h] for h in raw_headers if h in header_mapping]  # keep only mapped fields

    rows = []
    for line in lines[1:]:
        values = re.split(r'\s{2,}', line.strip())
        if len(values) == len(raw_headers):  # match against the raw header count (column counts must agree)
            # Map values to target field names in raw-header order.
            mapped_values = {header_mapping[raw_headers[i]]: values[i] for i in range(len(raw_headers)) if raw_headers[i] in header_mapping}
            rows.append(mapped_values)

    result = {
        "total": len(rows),
        "rows": rows
    }

    return result


if __name__ == "__main__":
    # Sample input (kubectl output; columns are separated by two or more spaces,
    # as required by the \s{2,} split above).
    input_text1 = '''NAME         STATUS   ROLES           AGE     VERSION   INTERNAL-IP   EXTERNAL-IP   OS-IMAGE             KERNEL-VERSION      CONTAINER-RUNTIME
k8s-master   Ready    control-plane   4d19h   v1.28.2   192.168.0.3   <none>        Ubuntu 22.04.1 LTS   5.15.0-91-generic   containerd://1.7.24'''

    # Run the conversion.
    # output1 = parse_k8s_nodes(input_text1)
    # print(output1)
    input_text2 = '''NAMESPACE      NAME                    READY   STATUS    RESTARTS   AGE   IP            NODE                        NOMINATED NODE   READINESS GATES
kube-flannel   kube-flannel-ds-sbkgm   1/1     Running   0          14m   192.168.0.2   k8s-worker-20250408132243   <none>           <none>'''

    result = parse_k8s_pods(input_text2)
    print(json.dumps(result, indent=2))
app/k8sManager/pcapi_utils.py (new file, 16 lines)
@@ -0,0 +1,16 @@
import socket


def get_local_ipv4():
    """Get the local IPv4 address."""
    try:
        # Resolve the local hostname to an address.
        local_ip = socket.gethostbyname(socket.gethostname())
    except socket.gaierror:
        # Fallback when the hostname cannot be resolved directly: prefer a
        # non-loopback address from the interface list; otherwise learn the
        # outbound address by opening a UDP socket towards a public resolver
        # (connect() on a UDP socket sends no packet).
        ips = [ip for ip in socket.gethostbyname_ex(socket.gethostname())[2]
               if not ip.startswith("127.")]
        if ips:
            local_ip = ips[0]
        else:
            s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            try:
                s.connect(('8.8.8.8', 53))
                local_ip = s.getsockname()[0]
            finally:
                s.close()
    return local_ip


if __name__ == "__main__":
    print(get_local_ipv4())
app/k8sManager/ssh_utils.py (new file, 240 lines)
@@ -0,0 +1,240 @@
# Currently unused:
# from appPublic import sshx
# TODO: switch to g.debug when there is time.
import time
import os
import re
from appPublic.log import debug
import paramiko
import socket
import traceback


def ssh_execute_command(host, port, username, password, commands, real_time_log=False,
                        remote_exec=True, scp_map=dict()):
    try:
        # Create the SSH client.
        ssh = paramiko.SSHClient()
        # Allow connecting to hosts that are not in known_hosts.
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        # Connect to the server.
        ssh.connect(hostname=host, port=port, username=username, password=password)
        all_results = []
        if scp_map:
            # Create an SFTP client.
            sftp = ssh.open_sftp()
            # Copy each local file to its remote destination.
            for sf, df in scp_map.items():
                debug(f"remote copy {sf=} => {df=}")
                sftp.put(sf, df)
            # Close the SFTP session.
            sftp.close()
        if remote_exec:
            # Common execution flow.
            result = ""
            error = ""
            for command in commands:
                stdin, stdout, stderr = ssh.exec_command(f'{command}', get_pty=True)
                stdin.flush()
                if real_time_log:
                    debug(f"running command: {command=}, please wait...")
                    # Stream stdout.
                    for line in iter(stdout.readline, ""):
                        debug(f'{line=}')
                        result += line
                    # Stream stderr.
                    for line in iter(stderr.readline, ""):
                        debug(f'{line=}')
                        error += line
                else:
                    result = stdout.read().decode(errors="replace")
                    error = stderr.read().decode(errors="replace")

                all_results.append((result, error))
                if real_time_log:
                    debug(f"command {command=} finished")
        # Close the connection.
        ssh.close()
        return all_results
    except Exception as e:
        debug(f"SSH connection or command execution failed: {e=}")
        return [e]


# ---------------- Below: the base layer that lets non-root users perform root operations ----------------


def ssh_execute_command_noroot(host, port, username, password, commands, real_time_log=False,
                               remote_exec=True, scp_map=dict(), temp_dir="/tmp/ssh_temp", sudo_timeout=500):
    """
    Enhanced SSH command executor: lets a normal user copy files into
    root-owned directories and run sudo commands.
    The sudo_timeout parameter controls the sudo command timeout in seconds.
    """
    try:
        # Create the SSH connection.
        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh.connect(hostname=host, port=port, username=username, password=password)
        all_results = []

        # Create a temporary directory if files need to be transferred.
        if scp_map:
            # Create the temporary directory.
            create_temp_cmd = f"mkdir -p {temp_dir} && chmod 700 {temp_dir}"
            stdin, stdout, stderr = ssh.exec_command(create_temp_cmd)
            create_error = stderr.read().decode(errors="replace")
            if create_error:
                raise Exception(f"failed to create temporary directory: {create_error}")

            # Create the SFTP client.
            sftp = ssh.open_sftp()

            # Upload the files into the temporary directory.
            temp_scp_map = {}
            for local_path, remote_path in scp_map.items():
                # Determine the temporary target path.
                temp_remote_path = f"{temp_dir}/{os.path.basename(remote_path)}"
                debug(f"uploading {local_path} => {temp_remote_path}")
                sftp.put(local_path, temp_remote_path)
                temp_scp_map[temp_remote_path] = remote_path

            # Close the SFTP session.
            sftp.close()

            # Move the files from the temporary directory to their final
            # destinations (requires sudo).
            for temp_path, final_path in temp_scp_map.items():
                # Make sure the target directory exists.
                mkdir_cmd = f"sudo mkdir -p $(dirname {final_path})"
                execute_sudo_command(ssh, mkdir_cmd, password, real_time_log, sudo_timeout, username)

                # Move the file.
                move_cmd = f"sudo mv {temp_path} {final_path}"
                execute_sudo_command(ssh, move_cmd, password, real_time_log, sudo_timeout, username)

                # Set the file permissions.
                chmod_cmd = f"sudo chmod 644 {final_path}"
                if final_path.endswith('.sh'):  # shell scripts get the executable bit
                    chmod_cmd = f"sudo chmod 755 {final_path}"
                execute_sudo_command(ssh, chmod_cmd, password, real_time_log, sudo_timeout, username)

        # Run the remote commands, if requested.
        if remote_exec:
            for command in commands:
                # Run the command with sudo privileges.
                result, error = execute_sudo_command(ssh, command, password, real_time_log, sudo_timeout, username)
                all_results.append((result, error))

        # Clean up the temporary directory.
        if scp_map:
            cleanup_cmd = f"rm -rf {temp_dir}"
            stdin, stdout, stderr = ssh.exec_command(cleanup_cmd)

        # Close the SSH connection.
        ssh.close()
        return all_results

    except Exception as e:
        debug(f"SSH operation failed: {traceback.format_exc()}")
        raise e


def execute_sudo_command(ssh, command, password, real_time_log, sudo_timeout, username):
    """
    Run a command that needs sudo privileges, handling the password prompt and timeouts.
    """
    sudo_cmd = f"sudo -S -p '[sudo] password: ' {command}"
    # sudo_cmd = f"sudo -k -S -p '[sudo] password: ' {command}"  # -k would force a password prompt every time
    stdin, stdout, stderr = ssh.exec_command(sudo_cmd, get_pty=True)
    # The actual sudo invocation looks like:
    #     sudo -p '[sudo] password: ' echo hello
    # Set the command timeout.
    channel = stdout.channel
    channel.settimeout(timeout=sudo_timeout)

    # Handle the password prompt.
    password_prompt = False
    initial_output = ""

    try:
        debug("waiting for the sudo password prompt...")
        start_time = time.time()
        while True:
            ready = False
            # Check stdout.
            if channel.recv_ready():
                # A chunk may be as long as a full kubeconfig; read a large
                # buffer so the whole output is captured.
                chunk = channel.recv(5800).decode(errors="replace")
                initial_output += chunk
                debug(f"stdout: {chunk.strip()}")
                if re.search(r"\[sudo\] password:", chunk):
                    password_prompt = True
                    stdin.write(f"{password}\n")
                    stdin.flush()
                    break
                ready = True
            # Check stderr.
            if channel.recv_stderr_ready():
                chunk = channel.recv_stderr(5800).decode(errors="replace")
                initial_output += chunk
                debug(f"stderr: {chunk.strip()}")
                if re.search(r"\[sudo\] password:", chunk):
                    password_prompt = True
                    stdin.write(f"{password}\n")
                    stdin.flush()
                    break
                ready = True
            # Timeout check.
            if time.time() - start_time > sudo_timeout:
                raise Exception(f"timed out waiting for the sudo password prompt ({sudo_timeout}s): {sudo_cmd}")
            if not ready:
                time.sleep(1.5)  # avoid busy-waiting

        # If no password prompt was received but the command stalled, it may be
        # a permissions problem.
        if not password_prompt:
            # Wait a moment to make sure no prompt is coming.
            time.sleep(3)
            # debug(f"initial SSH output: {initial_output}")
            if not re.search(r"\[sudo\] password:", initial_output):
                raise Exception(f"no password prompt received; sudo may not allow this user to run the command: {sudo_cmd}")

    except socket.timeout:
        raise Exception(f"command timed out ({sudo_timeout}s): {sudo_cmd}")

    # Collect the command output.
    result = initial_output if not password_prompt else ""
    error = ""

    try:
        if real_time_log:
            debug(f"running command: {sudo_cmd}")
            # Stream stdout/stderr until the command exits.
            while True:
                if channel.recv_ready():
                    line = channel.recv(5800).decode(errors="replace")
                    debug(f"output: {line.strip()}")
                    result += line
                if channel.recv_stderr_ready():
                    line = channel.recv_stderr(5800).decode(errors="replace")
                    debug(f"error: {line.strip()}")
                    error += line
                if channel.exit_status_ready():
                    break
                time.sleep(1.5)  # avoid busy-waiting
        else:
            # Non-streaming mode.
            result += channel.recv(-1).decode(errors="replace") if channel.recv_ready() else ""
            error += channel.recv_stderr(-1).decode(errors="replace") if channel.recv_stderr_ready() else ""

    except socket.timeout:
        raise Exception(f"command timed out ({sudo_timeout}s): {sudo_cmd}")

    # Get the command exit status.
    exit_status = channel.recv_exit_status()

    # Check whether the sudo invocation failed.
    if exit_status != 0:
        if "incorrect password attempt" in error.lower():
            error = f"wrong password; cannot run sudo command: {sudo_cmd}"
        elif "not allowed to run sudo" in error.lower():
            error = f"user {username} lacks sudo permission for this command: {sudo_cmd}"

    return result, error
app/ldap/ldapOperate.py (new file, 100 lines)
@@ -0,0 +1,100 @@
|
||||
from ldap3 import Server, Connection, ALL, NTLM, SUBTREE, MODIFY_REPLACE
import json


# LDAP server information
# ldap_server_uri = 'ldap://127.0.0.1:7389'  # or ldaps://your-ldap-server-secure
ldap_server_uri = 'ldap://10.8.64.15'  # or ldaps://your-ldap-server-secure
ldap_user = 'cn=admin,dc=test,dc=com'
ldap_password = '123456'
ldap_base = 'dc=test,dc=com'

# Create the LDAP server object
server = Server(ldap_server_uri, get_info=ALL)
# Create the connection object and bind the user
conn = Connection(server, user=ldap_user, password=ldap_password, auto_bind=True)


def get_all_ldap_user():
    # Search entries
    search_filter = '(objectClass=person)'
    search_attribute = ['cn', 'sn', 'mail']
    conn.search(search_base=ldap_base,
                search_filter=search_filter,
                search_scope=SUBTREE,
                attributes=search_attribute)
    result = [json.loads(x.entry_to_json()) for x in conn.entries]
    return result


def get_all_ldap_cn():
    # Search entries
    search_filter = '(objectClass=posixGroup)'
    search_attribute = ['cn', 'objectClass', 'gidNumber']
    conn.search(search_base=ldap_base,
                search_filter=search_filter,
                search_scope=SUBTREE,
                attributes=search_attribute)

    result = [json.loads(x.entry_to_json()) for x in conn.entries]
    return result


def get_one_cn(cn):
    # Search entries
    search_filter = f'(&(cn={cn})(objectClass=posixGroup))'
    search_attribute = ['cn', 'objectClass', 'gidNumber']

    conn.search(search_base=ldap_base,
                search_filter=search_filter,
                search_scope=SUBTREE,
                attributes=search_attribute
                )
    # conn.entries is a (possibly empty) list, so test for emptiness
    if not conn.entries:
        return None
    else:
        return json.loads(conn.entries[0].entry_to_json())


'''
Example arguments
# uid = "test_add1"
# plaintext_password = "654321"
# uid_number = 123456
# cn = "test"
# add_ldap_user(uid, uid_number, plaintext_password, cn)
'''
def add_ldap_user(uid, uid_number, plaintext_password, cn):
    cn_attr = get_one_cn(cn)
    new_user_dn = f"uid={uid},ou=test,{ldap_base}"
    new_user_attrs = {
        "objectClass": ["top", "posixAccount", "inetOrgPerson", "shadowAccount"],
        "uidNumber": uid_number,
        "gidNumber": cn_attr["attributes"]["gidNumber"],
        'sn': [uid],
        'loginShell': ["/bin/bash"],
        'homeDirectory': ["/srv/nfs/" + uid],
        'cn': [cn]
    }

    flag = conn.add(new_user_dn, new_user_attrs["objectClass"], new_user_attrs)
    print(conn.result)
    if flag is True:
        return modify_password(new_user_dn, plaintext_password)
    else:
        return conn.result


def modify_password(new_user_dn, plaintext_password):
    mod_attrs = {
        'userPassword': (
            MODIFY_REPLACE, [plaintext_password])
    }
    conn.modify(new_user_dn, mod_attrs)
    return conn.result


def delete_ldap_user(uid):
    user_dn = f"uid={uid},ou=test,{ldap_base}"

    conn.delete(user_dn)
    return conn.result
94
app/pcapi.py
Normal file
@ -0,0 +1,94 @@
from ahserver.serverenv import ServerEnv
from ahserver.webapp import webapp
from ahserver.auth_api import get_client_ip
from ahserver.auth_api import AuthAPI
from appPublic.argsConvert import ArgsConvert
from appPublic.jsonConfig import getConfig
from appPublic.log import debug
from appPublic.worker import awaitify
from aiohttp import BasicAuth

from storage.common import get_storage_json  # example
# from ldap.ldapOperate import *  # no LDAP server at the moment

# Core interfaces for k8s multi-cluster management
from k8sManager.multiple_clusters import *


async def checkuserpasswd(obj, request, user, passwd):
    auth = request.headers.get('Authorization')
    if auth is None:
        debug(f'auth is None, {request.headers=}')
        return False
    if auth.startswith('Basic '):
        auther = BasicAuth('x')
        m = auther.decode(auth)
        username = m.login
        password = m.password
        config = getConfig()
        if username != config.authentication.user:
            debug(f'{username=},{password=}, user not match')
            return False
        if password != config.authentication.password:
            debug(f'{username=},{password=}, password not match')
            return False
        ip = get_client_ip(None, request)
        if ip not in config.authentication.iplist:
            debug(f'{username=},{password=}, ip not in allowed ip pools')
            return False
        return True
    debug('not a basic authentication')
    return False


async def create_namespaced_job_v1(namespace, jobdesc):
    batch_v1 = client.BatchV1Api()
    f = awaitify(batch_v1.create_namespaced_job)
    return await f(namespace=namespace, body=jobdesc)


async def determine_accommodat_by_kubeconfig_v1(params):
    f = awaitify(determine_accommodat_by_kubeconfig)
    return await f(params)


def paramify(data, ns):
    ac = ArgsConvert('${', '}$')
    d = ac.convert(data, ns)
    return d
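# Illustrative example (not in the original file):
#   paramify({"k": "${key}$"}, {"key": "abc"})  ->  {"k": "abc"}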

def init_func():
    AuthAPI.checkUserPermission = checkuserpasswd
    g = ServerEnv()
    # Example code
    g.create_namespaced_job_v1 = create_namespaced_job_v1
    g.paramify = paramify
    g.debug = debug

    ### LDAP related
    # g.add_ldap_user = add_ldap_user
    # g.get_all_ldap_user = get_all_ldap_user
    # g.get_all_ldap_cn = get_all_ldap_cn
    # g.get_one_cn = get_one_cn
    # g.modify_password = modify_password
    # g.delete_ldap_user = delete_ldap_user

    ### k8s multi-cluster related
    g.new_cluster_install = new_cluster_install
    g.get_multiple_cluster = get_multiple_cluster
    g.get_multiple_cluster_pod = get_multiple_cluster_pod
    g.get_cluster_nodes_by_server = get_cluster_nodes_by_server
    g.get_cluster_pods_by_server = get_cluster_pods_by_server
    g.delete_cluster_node = delete_cluster_node
    g.node_state_switch = node_state_switch
    g.yaml_apply_delete = yaml_apply_delete
    g.get_cluster_nodes_by_kubeconfig = get_cluster_nodes_by_kubeconfig
    g.determine_accommodat_by_kubeconfig = determine_accommodat_by_kubeconfig
    g.get_cluster_pods_by_kubeconfig = get_cluster_pods_by_kubeconfig
    g.node_label_opt = node_label_opt

    g.get_storage_json = get_storage_json
    g.result_dict = {
        "status": False,
        "info": "operate failed",
        "data": {}
    }


if __name__ == '__main__':
    webapp(init_func)
0
app/slurm/__init__.py
Normal file
87
app/slurm/job.py
Normal file
@ -0,0 +1,87 @@
from . import parse_job

from . import sshClient
import json


def get_history_job_command(query):
    command = "sacct -a -p "
    if "startStartTime" in query:
        command = command + "-S " + query["startStartTime"] + " "
    if "group" in query:
        command = command + "-g " + query["group"] + " "
    if "jobId" in query:
        command = command + "-j " + query["jobId"] + " "
    if "accountUserName" in query:
        command = command + "-u " + query["accountUserName"] + " "
    if "jobIdList" in query:
        # sacct takes a comma-separated job list after a single -j
        command = command + "-j " + ",".join(query["jobIdList"]) + " "

    command = command + "--format=JobId,JobName%30,User%50,state,partition,NodeList,AllocCPUS,Submit,Start,End,Group,Workdir%100,Priority,ReqTRES%50" + " "
    command = command + " " + "-X"
    return command
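# Illustrative example (not in the original file): for a query such as
#   {"startStartTime": "2024-01-01", "accountUserName": "alice"}
# this builds roughly:
#   sacct -a -p -S 2024-01-01 -u alice --format=JobId,JobName%30,...  -X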

'''
Get history jobs
'''
def get_history_list(data):
    command = get_history_job_command(data)

    return sshClient.exec_command(command)


def get_history_list_json(data):
    result = get_history_list(data)
    result_json = parse_job.process_data(result)
    return result_json


'''
Submit a job
'''
def submit_job(command):
    command = command.replace("\r", " ")
    return sshClient.exec_command(command)


'''
Resume a job
'''
def resume_job(jobId):
    command = "sudo scontrol resume " + jobId
    return sshClient.exec_command(command)


'''
Suspend a job
'''
def suspend_job(jobId):
    command = "sudo scontrol suspend " + jobId
    return sshClient.exec_command(command)


'''
Kill a job
'''
def kill_job(jobId):
    command = "sudo scancel " + jobId
    return sshClient.exec_command(command)

'''
Get live jobs
'''
def get_real_time_list(query):
    command = "squeue -a "
    return sshClient.exec_command(command)

'''
Get live jobs as JSON
'''
def get_real_time_list_json(query):
    command = "squeue -a --json"
    result = sshClient.exec_command(command)
    std_out = result["stdout"]
    std_out = json.loads(std_out)
    return std_out
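# Illustrative note (not in the original file): `squeue --json` returns a document
# shaped roughly like {"meta": {...}, "jobs": [...]}, so callers typically read
# std_out.get("jobs", []).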
57
app/slurm/node.py
Normal file
@ -0,0 +1,57 @@
from . import sshClient

'''
Get node details
'''
def get_node_details_json(NodeName):
    command = "scontrol show node "

    if NodeName is not None:
        command = command + NodeName

    print(command)

    result = sshClient.exec_command(command)
    data_str = result["stdout"]
    # Split on blank lines to get one block of data per node
    nodes_data = data_str.strip().split('\n\n')

    # List that collects one dict per node
    nodes_list = []

    # Walk through each node's data
    for node_data in nodes_data:
        # Dict that collects the current node's key-value pairs
        node_dict = {}
        # Split the current node's data into lines
        lines = node_data.strip().split('\n')
        for line in lines:
            if "OS" in line:
                node_dict["OS"] = line.split("=")[1]
            else:
                # Split the line into whitespace-separated key=value pairs
                key_value_pairs = line.strip().split()
                for pair in key_value_pairs:
                    pair_list = pair.split('=')
                    if len(pair_list) < 2:
                        key = pair_list[0]
                        value = ""
                    else:
                        key = pair_list[0]
                        value = pair_list[1]
                    # Store the key and value in the dict
                    node_dict[key] = value
        # Append the current node's dict to the list
        nodes_list.append(node_dict)
    return nodes_list
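# Illustrative example (not in the original file): a `scontrol show node` block like
#   NodeName=node1 Arch=x86_64 CPUTot=64
#   OS=Linux 5.4.0-42-generic
# parses to roughly
#   {"NodeName": "node1", "Arch": "x86_64", "CPUTot": "64", "OS": "Linux 5.4.0-42-generic"}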

def update_node(dict_data):
    command = "sudo scontrol update"
    if dict_data["NodeName"] is not None:
        command = command + " " + "NodeName=" + dict_data["NodeName"]
    command = command + " " + "State=" + dict_data["State"]
    command = command + " " + "Reason=" + dict_data["Reason"]

    result = sshClient.exec_command(command)
    return result
201
app/slurm/parse_job.py
Normal file
@ -0,0 +1,201 @@
import re
from datetime import datetime
from typing import List
import json


class jobJsonVO:
    def __init__(self):
        self.jobId = None
        self.jobName = None
        self.accountUserName = None
        self.status = None
        self.queueName = None
        self.execHosts = None
        self.numProcessors = None
        self.submitTime = None
        self.startTime = None
        self.endTime = None
        self.userGroup = None
        self.workDir = None
        self.userPriority = None
        self.gpuCardNum = None
        self.runningTime = None
        self.formatRunningTime = None
        self.jobProcessorTime = None
        self.jobGpuCardTime = None

    def get_start_time(self):
        return self.startTime

    def set_start_time(self, startTime):
        self.startTime = startTime

    def get_end_time(self):
        return self.endTime

    def set_end_time(self, endTime):
        self.endTime = endTime

    def get_status(self):
        return self.status

    def getNumProcessors(self):
        return self.numProcessors

    def getRunningTime(self):
        return self.runningTime


class JobConstants:
    DONE = "DONE"
    EXIT = "EXIT"
    RUN = "RUN"
    PEND = "PEND"
    CANCELLED = "CANCELLED"


def parse_status(status: str) -> str:
    if status == "COMPLETED":
        return JobConstants.DONE
    if status == "FAILED":
        return JobConstants.EXIT
    if status == "RUNNING":
        return JobConstants.RUN
    if status == "PENDING":
        return JobConstants.PEND
    if "CANCELLED" in status:
        return JobConstants.CANCELLED
    return status


def parse_slurm_str_to_str(date_str: str) -> str:
    # Normalise a Slurm timestamp such as 2024-01-01T08:00:00Z
    date_str = date_str.replace("T", " ")
    date_str = date_str.replace("Z", " ")
    return date_str


def handle_alloc_tres_get_gpus(tres_str: str) -> int:
    # Placeholder: implement GPU card count extraction from an AllocTRES string here
    return 0


def calculate_processor_time(job: jobJsonVO) -> float:
    # Processor time = allocated processors * running time
    processors = job.getNumProcessors()
    runningTime = job.getRunningTime()
    processorsRunningTime = processors * runningTime
    return processorsRunningTime


def calculate_gpu_card_time(job: jobJsonVO) -> float:
    # Placeholder: implement the GPU card time calculation here
    return 0.0


def parse_date(date_str: str) -> datetime:
    # Parse a string into a datetime object;
    # adjust the default format here as needed
    return datetime.strptime(date_str, "%Y-%m-%d %H:%M:%S")


def get_now_date() -> datetime:
    # Return the current time as a datetime object
    return datetime.now()


def get_running_time(item: jobJsonVO) -> int:
    t = item.get_start_time()
    if t is None:
        return 0

    start_time_temp = parse_slurm_str_to_str(item.get_start_time())
    item.set_start_time(start_time_temp)

    if start_time_temp == "-":
        return 0

    start_time_date = datetime.now()  # default to the current time
    if item.get_status() != JobConstants.PEND:
        start_time_date = parse_date(start_time_temp)

    running_time = 0
    status = item.get_status()

    if status in [JobConstants.DONE, JobConstants.EXIT]:
        end_time_temp = start_time_temp if item.get_end_time() is None else parse_slurm_str_to_str(item.get_end_time())
        end_time_date = parse_date(end_time_temp)
        item.set_end_time(end_time_temp)
        running_time = int((end_time_date - start_time_date).total_seconds() * 1000)  # milliseconds

    elif status == JobConstants.RUN:
        running_time = int((get_now_date() - start_time_date).total_seconds() * 1000)  # milliseconds

    elif status == JobConstants.CANCELLED:
        end_time_date = parse_date(item.get_end_time()) if item.get_end_time() else start_time_date
        running_time = int((end_time_date - start_time_date).total_seconds() * 1000)  # milliseconds

    elif status == JobConstants.PEND:
        running_time = 0

    return running_time


def format_running_time(job: jobJsonVO) -> str:
    # Placeholder: implement the running time formatting here
    return ""


def process_data(data: dict) -> List[jobJsonVO]:
    try:
        item_list = data["stdout"].split("\n")
        job_json_list = []
        if len(item_list) < 1:
            return []

        for i in range(len(item_list)):
            if i < 1:
                # Skip the header row
                continue

            words = item_list[i].split("|")
            word_list = [word.strip() for word in words if word.strip()]

            if len(word_list) < 14:
                continue

            job_json = jobJsonVO()

            try:
                jobId = int(word_list[0])  # keep only rows with a numeric job id
            except ValueError:
                continue

            job_json.jobId = word_list[0]
            job_json.jobName = word_list[1]
            job_json.accountUserName = word_list[2]
            job_json.status = parse_status(word_list[3])
            job_json.queueName = word_list[4]
            job_json.execHosts = word_list[5]

            try:
                job_json.numProcessors = int(word_list[6])
                job_json.submitTime = parse_slurm_str_to_str(word_list[7])
                job_json.startTime = parse_slurm_str_to_str(word_list[8])
            except (ValueError, IndexError):
                continue

            if job_json.status == "DONE" or len(word_list) >= 10:
                if word_list[9] != "Unknown":
                    job_json.endTime = parse_slurm_str_to_str(word_list[9])

            job_json.userGroup = word_list[10]
            job_json.workDir = word_list[11]
            job_json.userPriority = int(word_list[12])
            job_json.gpuCardNum = handle_alloc_tres_get_gpus(word_list[13])

            job_json.runningTime = get_running_time(job_json)
            job_json.formatRunningTime = format_running_time(job_json)
            job_json.jobProcessorTime = calculate_processor_time(job_json)
            job_json.jobGpuCardTime = calculate_gpu_card_time(job_json)

            job_dict = job_json.__dict__

            job_json_list.append(job_dict)

        job_json_list.reverse()

        return job_json_list

    except Exception as e:
        print(f"An error occurred: {e}")
        raise Exception("CLUSTER_ERROR")
90
app/slurm/partition.py
Normal file
@ -0,0 +1,90 @@
from . import sshClient
import json

'''
Create a partition (queue)
'''
def create_partition(dict):
    command = "sudo scontrol create "
    for key, value in dict.items():
        command = command + key + "=" + str(value) + " "

    print(command)
    return sshClient.exec_command(command)
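# Illustrative example (not in the original file):
#   create_partition({"PartitionName": "debug", "Nodes": "node[1-2]"})
# runs: sudo scontrol create PartitionName=debug Nodes=node[1-2]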

'''
Update a partition
'''
def update_partition(dict):
    command = "sudo scontrol update "
    for key, value in dict.items():
        command = command + key + "=" + str(value) + " "

    return sshClient.exec_command(command)


'''
Delete a partition
'''
def delete_partition(dict):
    command = "sudo scontrol delete "
    for key, value in dict.items():
        command = command + key + "=" + str(value) + " "

    return sshClient.exec_command(command)

'''
Query partition details
'''
def list_partition_detail(PartitionName):
    command = "sudo scontrol show part "
    if PartitionName is not None:
        command = command + PartitionName + " "

    return sshClient.exec_command(command)


'''
Query brief partition info
'''
def list_partition_info(query):
    command = "sudo sinfo "

    return sshClient.exec_command(command)


'''
Query partition details, parsed to JSON
'''
def list_partition_detail_json(query):
    command = "scontrol show part "

    if query["partitionName"] is not None:
        command = command + " " + query["partitionName"]
    result = sshClient.exec_command(command)

    data_str = result["stdout"]
    # Split on blank lines to get one block of data per partition
    partition_blocks = data_str.strip().split('\n\n')

    # List that collects one dict per partition
    partition_list = []

    # Walk through each partition's data
    for partition_data in partition_blocks:
        # Dict that collects the current partition's key-value pairs
        partition_dict = {}
        # Split the current partition's data into lines
        lines = partition_data.strip().split('\n')
        for line in lines:
            # Split the line into whitespace-separated key=value pairs
            key_value_pairs = line.strip().split()

            for pair in key_value_pairs:
                pair_list = pair.split('=')
                if len(pair_list) < 2:
                    key = pair_list[0]
                    value = ""
                else:
                    key = pair_list[0]
                    value = pair_list[1]
                # Store the key and value in the dict
                partition_dict[key] = value
        # Append the current partition's dict to the list
        partition_list.append(partition_dict)

    return partition_list
55
app/slurm/sshClient.py
Normal file
@ -0,0 +1,55 @@
import paramiko


def exec_command(command):
    # SSH connection parameters
    hostname = '10.8.64.15'
    port = 22  # SSH port, 22 by default
    username = 'ceni'
    password = '1qazXSW@34'

    # hostname = '127.0.0.1'
    # port = 722  # SSH port, 22 by default
    # username = 'ceni'
    # password = '1qazXSW@34'

    # Create the SSH client
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(hostname, port, username, password)

    # Run the command
    stdin, stdout, stderr = client.exec_command(command)
    result_out = stdout.read().decode("utf-8")
    result_error = stderr.read().decode("utf-8")

    result = {
        "stdout": result_out,
        "stderr": result_error
    }
    # Close the connection
    client.close()

    return result
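# Usage sketch (illustrative; the credentials above are hard-coded):
#   out = exec_command("sinfo")
#   print(out["stdout"], out["stderr"])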

def exec_command_hostname(command, hostname, port, username, password):
    # Create the SSH client
    client = paramiko.SSHClient()
    client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    client.connect(hostname, port, username, password)

    # Run the command
    stdin, stdout, stderr = client.exec_command(command)
    result_out = stdout.read().decode("utf-8")
    result_error = stderr.read().decode("utf-8")

    result = {
        "stdout": result_out,
        "stderr": result_error
    }
    # Close the connection
    client.close()

    return result
0
app/storage/__init__.py
Normal file
32
app/storage/common.py
Normal file
@ -0,0 +1,32 @@
from slurm import sshClient
import json


def df_to_json(df_output):
    # Parse the df output
    lines = df_output.strip().split("\n")
    headers = lines[0].split()
    data = []

    for line in lines[1:]:
        values = line.split()
        entry = {
            headers[0]: values[0],  # filesystem
            headers[1]: values[1],  # type
            headers[2]: values[2],  # size
            headers[3]: values[3],  # used
            headers[4]: values[4],  # avail
            headers[5]: values[5],  # use_percent
            headers[6]: values[6],  # mounted_on
        }
        data.append(entry)
    return data
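# Illustrative example (not in the original file): a df line such as
#   /dev/sda1 ext4 50G 20G 28G 42% /
# becomes roughly
#   {"Filesystem": "/dev/sda1", "Type": "ext4", "Size": "50G", "Used": "20G",
#    "Avail": "28G", "Use%": "42%", "Mounted": "/"}
# (the keys are whatever df prints in its header row).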

def get_storage_json(point):
    command = "df -h --output=source,fstype,size,used,avail,pcent,target"
    if point is not None:
        command = command + " " + point
    result = sshClient.exec_command(command)
    stdout = result["stdout"]
    print(result)
    result_json = df_to_json(stdout)
    print(result_json)
    return result_json
336
app/unit_test/k8sinfo_cpu_memory_gpu.py
Normal file
@ -0,0 +1,336 @@
from kubernetes import client, config
import json
import re
from typing import Dict, Any, List, Tuple
import yaml
import time


def get_node_info(kubeconfig):
    try:
        # Load the kubeconfig
        kubeconfig = yaml.safe_load(kubeconfig)
        config.load_kube_config_from_dict(kubeconfig)
        v1 = client.CoreV1Api()
        api_client = client.ApiClient()

        # Fetch node metrics and the Pod list
        node_metrics_path = "/apis/metrics.k8s.io/v1beta1/nodes"
        node_metrics_response = api_client.call_api(
            node_metrics_path, 'GET', auth_settings=['BearerToken'], response_type='object')[0]
        node_metrics = {node['metadata']['name']: node.get('usage', {})
                        for node in node_metrics_response.get('items', [])}

        # Fetch all Pods and their resource requests
        pods = v1.list_pod_for_all_namespaces().items
        node_pod_resources = {}  # resource requests of the Pods on each node
        print(pods)

        for pod in pods:
            if pod.spec.node_name and pod.status.phase in ["Running", "Pending"]:
                node_name = pod.spec.node_name
                if node_name not in node_pod_resources:
                    node_pod_resources[node_name] = {
                        "cpu": 0,
                        "memory": 0,
                        "gpu": 0
                    }

                # Accumulate the resources requested by each container
                for container in pod.spec.containers:
                    if container.resources and container.resources.requests:
                        # CPU (converted to millicores)
                        cpu_request = container.resources.requests.get("cpu", "0m")
                        cpu_millis = int(float(cpu_request.rstrip("m"))) if "m" in cpu_request else int(float(cpu_request) * 1000)
                        node_pod_resources[node_name]["cpu"] += cpu_millis

                        # Memory (converted to bytes)
                        memory_request = container.resources.requests.get("memory", "0")
                        memory_bytes = int(float(memory_request.rstrip("KiMiGi")))
                        if "Ki" in memory_request:
                            memory_bytes *= 1024
                        elif "Mi" in memory_request:
                            memory_bytes *= 1024 * 1024
                        elif "Gi" in memory_request:
                            memory_bytes *= 1024 * 1024 * 1024
                        node_pod_resources[node_name]["memory"] += memory_bytes

                        # GPU
                        gpu_request = container.resources.requests.get("nvidia.com/gpu", "0")
                        node_pod_resources[node_name]["gpu"] += int(gpu_request)

        # Fetch the node list and work out resource usage
        nodes = v1.list_node().items
        rows = []

        for node in nodes:
            node_name = node.metadata.name
            internal_ip = next((address.address for address in node.status.addresses
                                if address.type == "InternalIP"), "unassigned")
            external_ip = next((address.address for address in node.status.addresses
                                if address.type == "ExternalIP"), "unassigned")
            status = node.status.conditions[-1].status if node.status.conditions else "Unknown"
            status = "Ready" if status == "True" else "NotReady"

            # Node roles
            roles = []
            role_labels = [
                "node-role.kubernetes.io/control-plane",
                "node-role.kubernetes.io/master",
                "node-role.kubernetes.io/worker"
            ]
            for label in role_labels:
                if label in node.metadata.labels:
                    roles.append(label.split("/")[-1])
            roles_str = "control plane node" if roles else "worker node"

            # Node uptime
            running_time = time.time() - node.metadata.creation_timestamp.timestamp()
            node_age = running_time

            # Node info
            k8s_version = node.status.node_info.kubelet_version
            os_image = node.status.node_info.os_image
            kernel_version = node.status.node_info.kernel_version
            container_runtime = node.status.node_info.container_runtime_version

            # Custom labels
            labels = node.metadata.labels
            kyy_labels = [f"{k}={v}" for k, v in labels.items() if k.startswith('kyy-')]

            # Live resource usage
            cpu_usage = node_metrics.get(node_name, {}).get('cpu', 'undefined')
            if cpu_usage and isinstance(cpu_usage, str):
                cpu_usage = int(cpu_usage.replace("n", ""))
                cpu_usage = f'{(cpu_usage / 1000000 / 1000):.3f} cores'

            memory_usage = node_metrics.get(node_name, {}).get('memory', 'undefined')
            if memory_usage and isinstance(memory_usage, str):
                memory_usage = int(memory_usage.replace("Ki", ""))
                memory_usage = f"{(memory_usage / 1024 / 1024):.3f}Gi"

            # Total node resources
            # parse_resource_value (below) normalises memory values to Gi
            total_cpu = float(node.status.allocatable.get("cpu", "0"))
            total_memory = parse_resource_value(node.status.allocatable.get("memory", "0"))
            total_gpu = int(node.status.allocatable.get("nvidia.com/gpu", "0"))

            # Allocated resources
            allocated_cpu = node_pod_resources.get(node_name, {}).get("cpu", 0) / 1000.0  # to cores
            allocated_memory = node_pod_resources.get(node_name, {}).get("memory", 0) / (1024 ** 3)  # to Gi
            allocated_gpu = node_pod_resources.get(node_name, {}).get("gpu", 0)

            # Available resources
            available_cpu = total_cpu - allocated_cpu
            available_memory = total_memory - allocated_memory
            available_gpu = total_gpu - allocated_gpu

            node_info = {
                # "node_name": node_name,
                # "node_status": status,
                # "node_role": roles_str,
                # "node_age": node_age,
                # "node_version": k8s_version,
                # "node_internalip": internal_ip,
                # "node_externalip": external_ip,
                # "node_osversion": os_image,
                # "node_kernelversion": kernel_version,
                # "node_containeruntime": container_runtime,
                # "node_labels": kyy_labels,
                # "node_cpurate": cpu_usage,
                # "node_memrate": memory_usage,
                # Added resource information
                "node_total_cpu": f"{total_cpu:.2f} cores",
                "node_allocated_cpu": f"{allocated_cpu:.2f} cores",
                "node_available_cpu": f"{available_cpu:.2f} cores",
                "node_cpu_usage_percent": f"{(allocated_cpu / total_cpu * 100):.1f}%" if total_cpu > 0 else "0%",

                "node_total_memory": f"{total_memory:.2f}Gi",
                "node_allocated_memory": f"{allocated_memory:.2f}Gi",
                "node_available_memory": f"{available_memory:.2f}Gi",
                "node_memory_usage_percent": f"{(allocated_memory / total_memory * 100):.1f}%" if total_memory > 0 else "0%",

                "node_total_gpu": total_gpu,
                "node_allocated_gpu": allocated_gpu,
                "node_available_gpu": available_gpu,
                "node_gpu_usage_percent": f"{(allocated_gpu / total_gpu * 100):.1f}%" if total_gpu > 0 else "0%"
            }
            rows.append(node_info)

        result = {
            "total": len(rows),
            "rows": rows
        }
        print(f"=== node_info={result}")
        return result
    except Exception as e:
        import traceback
        traceback.print_exc()
        print(f"Failed to get node info: {e}")
        raise e


# Helper: parse a resource value
def parse_resource_value(value: str) -> float:
    """Parse a Kubernetes resource value (e.g. "1.5", "500m", "2Gi"):
    CPU values are returned in cores, memory values in Gi."""
    if not value:
        return 0.0

    # CPU (cores or millicores)
    if value.endswith('m'):
        return float(value[:-1]) / 1000.0  # to cores
    elif re.match(r'^\d+(\.\d+)?$', value):
        return float(value)  # already cores (or a bare number)

    # Memory (Ki, Mi, Gi, Ti), normalised to Gi
    elif value.endswith('Ki'):
        return float(value[:-2]) / (1024 ** 2)  # Ki to Gi
    elif value.endswith('Mi'):
        return float(value[:-2]) / 1024  # Mi to Gi
    elif value.endswith('Gi'):
        return float(value[:-2])
    elif value.endswith('Ti'):
        return float(value[:-2]) * 1024  # Ti to Gi

    return float(value)  # fall back to the original unit
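# Illustrative examples (given the Gi-normalised behaviour above):
#   parse_resource_value("500m")       -> 0.5    (CPU, cores)
#   parse_resource_value("2Gi")        -> 2.0    (memory, Gi)
#   parse_resource_value("16252928Ki") -> ~15.5  (memory, Gi)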


def get_pod_info(kubeconfig):
    try:
        # config.load_kube_config()
        kubeconfig = yaml.safe_load(kubeconfig)
        config.load_kube_config_from_dict(kubeconfig)
        v1 = client.CoreV1Api()
        api_client = client.ApiClient()

        namespaces = v1.list_namespace().items
        non_system_namespaces = [ns.metadata.name for ns in namespaces if
                                 not ns.metadata.name.startswith(('kube-', 'default', 'local', 'ingress-'))]

        rows = []
        for namespace in non_system_namespaces:
            pods = v1.list_namespaced_pod(namespace).items
            pod_metrics_path = f"/apis/metrics.k8s.io/v1beta1/namespaces/{namespace}/pods"
            pod_metrics_response = api_client.call_api(
                pod_metrics_path, 'GET', auth_settings=['BearerToken'], response_type='object')[0]
            pod_metrics = {pod['metadata']['name']: pod.get("containers", [{}])[0].get('usage', {})
                           for pod in pod_metrics_response.get('items', [])}

            # debug(f"### pod_metrics_response={pod_metrics_response}")
            for pod in pods:
                pod_name = pod.metadata.name
                if pod.status.container_statuses:
                    ready_count = sum(1 for cs in pod.status.container_statuses if cs.ready)
                else:
                    ready_count = 0
                ready_status = f"{ready_count}/{len(pod.spec.containers)}"
                readiness_conditions = [{"type": cond.type, "status": cond.status}
                                        for cond in pod.status.conditions if cond.type == "Ready"]
                phase = pod.status.phase
                restart_count = sum(cs.restart_count for cs in pod.status.container_statuses) if pod.status.container_statuses else 0
                running_time = time.time() - pod.metadata.creation_timestamp.timestamp()
                pod_age = running_time
                pod_ip = pod.status.pod_ip if pod.status.pod_ip else "Unknown"
                node_name = pod.spec.node_name if pod.spec.node_name else "Pod not scheduled to a node"
                nominated_node = pod.status.nominated_node_name if pod.status.nominated_node_name else "none"

                if phase == "Pending":
                    pod_ip = "Pending, no IP assigned"
                    node_name = "Pending, no node assigned"
                    nominated_node = "Pending, no node assigned"

                # Extract the containers' resource limits
                cpu_limit = "not set"
                memory_limit = "not set"
                gpu_limit = "not set"

                if pod.spec.containers:
                    container = pod.spec.containers[0]  # assume only the first container matters
                    if container.resources and container.resources.limits:
                        limits = container.resources.limits
                        cpu_limit = limits.get("cpu", "not set")
                        memory_limit = limits.get("memory", "not set")
                        gpu_limit = limits.get("nvidia.com/gpu", "not set")  # NVIDIA GPUs only

                # Fetch the metrics data (same logic as before)
                cpu_usage = pod_metrics.get(pod_name, {}).get('cpu', 'undefined')
                if cpu_usage and isinstance(cpu_usage, str):
                    cpu_usage = int(cpu_usage.replace("n", ""))
                    cpu_usage = f'{(cpu_usage / 1000000 / 1000):.3f} cores'
                memory_usage = pod_metrics.get(pod_name, {}).get('memory', 'undefined')
                if memory_usage and isinstance(memory_usage, str):
                    memory_usage = int(memory_usage.replace("Ki", ""))
                    memory_usage = f"{(memory_usage / 1024):.3f}Mi"

                if phase in ["Pending", "Succeeded", "Failed"]:
                    cpu_usage = "Pod not running, no usage data"
                    memory_usage = "Pod not running, no usage data"

                # New GPU usage field (placeholder for now)
                gpu_usage = "0%"  # replace with a real value if DCGM / Prometheus is available
                pod_info = {
                    "pod_namespace": namespace,
                    "pod_name": pod_name,
                    "pod_ready": ready_status,
                    "pod_running": phase,
                    "pod_restart": str(restart_count),
                    "pod_age": pod_age,
                    "pod_ip": pod_ip,
                    "pod_node": node_name,
                    "pod_nominated_node": nominated_node,
                    "pod_cpurate": cpu_usage,
                    "pod_memrate": memory_usage,
                    # New fields
                    "pod_cpu_limit": cpu_limit,
                    "pod_memory_limit": memory_limit,
                    "pod_gpu_limit": gpu_limit,
                    "pod_gpu_usage": gpu_usage,
                }
                rows.append(pod_info)

        result = {
            "total": len(rows),
            "rows": rows
        }
        return result
    except Exception as e:
        # raising a plain string is invalid in Python 3; wrap the error instead
        raise Exception(f"Failed to get Pod info: {e}")


if __name__ == "__main__":
    kubeconfig = """apiVersion: v1
clusters:
- cluster:
    certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURCVENDQWUyZ0F3SUJBZ0lJTGd4THlGMjM3QmN3RFFZSktvWklodmNOQVFFTEJRQXdGVEVUTUJFR0ExVUUKQXhNS2EzVmlaWEp1WlhSbGN6QWVGdzB5TlRBME1ETXdOelE1TXpWYUZ3MHpOVEEwTURFd056VTBNelZhTUJVeApFekFSQmdOVkJBTVRDbXQxWW1WeWJtVjBaWE13Z2dFaU1BMEdDU3FHU0liM0RRRUJBUVVBQTRJQkR3QXdnZ0VLCkFvSUJBUURQUm5hdkZmNXBTWWUvVmJLc0s2SnhEazhyc2hsc2h5WnNNRk8xZDVhZG45Z055T0wwR2NtbEsrQ1EKVklKSnF3RklJeSsxUVlEd3VRMytzczEwYmV2Y2lqM1BRanluaXJRRkNhRlA0NHh2ZkEyK2thV1FYeTVncGwrMwpjSkI1K1MxVmx2Vi9aSHQ5SXgwNjFCdHB4dE5oMUkxNS9IYk4rWmVNNnEvd3lxUW93Y01ub2pyNDltYkxxOWNwCnFSem5LL2FwWXlBYnljUk9uWWlIZ0FjQWdsclFOTjBKUEJZd2dRd0pIUmlIcGhtVFBkdmY2ckxkNFR0dFl2OXgKdmZIRDNjVUdwZkVBUElaNUJBVi9ZM3p5V0pSbDQzSFV2Ri9jemNDQ01jOVlUd3VXaEpxb2doUUZUdnNuSVZzTwovNEtKQzRwQXFSenJlZFRWdExmMXgzQlRpVCt0QWdNQkFBR2pXVEJYTUE0R0ExVWREd0VCL3dRRUF3SUNwREFQCkJnTlZIUk1CQWY4RUJUQURBUUgvTUIwR0ExVWREZ1FXQkJUZjRZbzBpOVpIZC9ObzdkYWZrZVRTbzVzdzN6QVYKQmdOVkhSRUVEakFNZ2dwcmRXSmxjbTVsZEdWek1BMEdDU3FHU0liM0RRRUJDd1VBQTRJQkFRRERLalJWVVp1YwppckJ4QWdZWnBBeW5NSHdWQTF6YStVT09FM1U0MEMyVTN0VGgrK1BsN2o2ZGJZTWNWdEFvTXhxaDRiVjNQRW5SCmtWcWNaN2NjS3FGSXduZlhHK0ZGTVVwazVoTk0xLzc2UXBobi9OWk8zSStSVjFtV0VOU1BzM1FZdEVoWktXUlgKYWRXZ0krK0x1cUZyZVpTVzVjRXNnMWZDODFtd3dhTXdkRHZWcFJZMFEwWlBsMGFqTURsSlNDaDNOSXpQOS82bwpndXBrY1JSdWtvRGlscWVraXlrRWJ5OVJCWHZIbXo3Q0sxQ1ZnZXZJTDZrVnRPRFF2Rm10Qm1WemlRNWFDcXJOCmtZNmd6OUNGMkdKc2M4UkZrcWQxbzdMelhPakJsTkdzN2k2WmdEOE1Ca2tiank2RmZDZWVndmxOOGFCU2VmblEKZ2ZNOVptbnRpMVNDCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K
    server: https://192.168.0.3:6443
  name: kubernetes
contexts:
- context:
    cluster: kubernetes
    user: kubernetes-admin
  name: kubernetes-admin@kubernetes
current-context: kubernetes-admin@kubernetes
kind: Config
preferences: {}
users:
- name: kubernetes-admin
  user:
    client-certificate-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSURJRENDQWdpZ0F3SUJBZ0lIVGZPdmU4TzBJVEFOQmdrcWhraUc5dzBCQVFzRkFEQVZNUk13RVFZRFZRUUQKRXdwcmRXSmxjbTVsZEdWek1CNFhEVEkxTURRd016QTNORGt6TlZvWERUSTJNRFF3TXpBM05UUXpOMW93TkRFWApNQlVHQTFVRUNoTU9jM2x6ZEdWdE9tMWhjM1JsY25NeEdUQVhCZ05WQkFNVEVHdDFZbVZ5Ym1WMFpYTXRZV1J0CmFXNHdnZ0VpTUEwR0NTcUdTSWIzRFFFQkFRVUFBNElCRHdBd2dnRUtBb0lCQVFEWVJJT3h0TWFkOWs2T1JsL1UKZ2ZnZVJDQkpjZmMrc2ZFbzkxeW4vc05KZFVIbWRuamtMaC9wRjcwZkdoVWZ3R2t5dzR0WkdpTFFNR0xwclpyeAphVTdJT0R3a3I2ejl1SkQzaHlFZExhZGpZT0NOMHJhUFNpV05GV1QwSVN2UVBjZzNGQkQ2YmFHb2RtSmN5YnBPCk5qY1VZZmh5WEVqRXMwOU92QzhhZUJCbm9Na1RkRk53dlFaYXE2LzR3eTUyN0k3aUdIUVdvL21JS1VUVHhzRFgKMzJnVXErZmRVMEh5STJJeWhNMGdwT29uNURCVmRUbWsyMkZsVHk0ZWJ3Q3R4QmMvRCtpelhuZFpVd2tHMExMVwpqTEc4L3JkWTZ4WFJDVkhHM1BWNURRK0JvNEpnMTUwWWFSUnBKeDJYSGxad3N5OFBZcWVLcTM0b1pxczRTRndmCjJCY3JBZ01CQUFHalZqQlVNQTRHQTFVZER3RUIvd1FFQXdJRm9EQVRCZ05WSFNVRUREQUtCZ2dyQmdFRkJRY0QKQWpBTUJnTlZIUk1CQWY4RUFqQUFNQjhHQTFVZEl3UVlNQmFBRk4vaGlqU0wxa2QzODJqdDFwK1I1TktqbXpEZgpNQTBHQ1NxR1NJYjNEUUVCQ3dVQUE0SUJBUUFTR0phc1EyQXpLdVNZWFdtMGlYOUhnWTNZQUJGMHpYRzRKZU5lCjREekxkOHF2TXlqRGMwUWFWSUtNbWswemhrV1ZIQzNKSEZWalRXcDBUNFE0TlVBMk8rOXFob1p0a25NL3dsQlUKS0Zab3ZHNFd6SU1sdVJwL21ZRUIzL3dHbkFPV01MdEtBSWJ3d3FRVWl4VW5KYkxCeG4xQ1k5ZERzb1o4VmZZMQp4N2R0WDBJWjJkbU1ETTVLV1lrbW5tQWJBR0tXazZBR3pVWEpWNmlTU3laYjlWLzNuN3hmZlpZRkVDQXBQNk91CjhmRGdIVjBCdEMxS3VmU0tsTitLMnF2aXAzMlRjRHdoTEVHQWQ2aU9qYzhBRXlHelJmOWY4M0xUSGJ2dGtibjYKR0VQQlBQSExSTFlQWEh0OE9LbHdNOThwQWxkSkIyWEJ6UEttc0JFeGFOSWRXd2FTCi0tLS0tRU5EIENFUlRJRklDQVRFLS0tLS0K
    client-key-data: LS0tLS1CRUdJTiBSU0EgUFJJVkFURSBLRVktLS0tLQpNSUlFcEFJQkFBS0NBUUVBMkVTRHNiVEduZlpPamtaZjFJSDRIa1FnU1hIM1BySHhLUGRjcC83RFNYVkI1blo0CjVDNGY2UmU5SHhvVkg4QnBNc09MV1JvaTBEQmk2YTJhOFdsT3lEZzhKSytzL2JpUTk0Y2hIUzJuWTJEZ2pkSzIKajBvbGpSVms5Q0VyMEQzSU54UVErbTJocUhaaVhNbTZUalkzRkdINGNseEl4TE5QVHJ3dkduZ1FaNkRKRTNSVApjTDBHV3F1ditNTXVkdXlPNGhoMEZxUDVpQ2xFMDhiQTE5OW9GS3ZuM1ZOQjhpTmlNb1ROSUtUcUorUXdWWFU1CnBOdGhaVTh1SG04QXJjUVhQdy9vczE1M1dWTUpCdEN5MW95eHZQNjNXT3NWMFFsUnh0ejFlUTBQZ2FPQ1lOZWQKR0drVWFTY2RseDVXY0xNdkQyS25pcXQrS0dhck9FaGNIOWdYS3dJREFRQUJBb0lCQVFDQ1djRjZ3YmdaQzVWTApvZFV1MCt1RjZvLy9WS2F1YmpncDlmWXQ5NXNqVW42Vzl2OWtvUHh3MVBNVHBQZm9mR09yeWpyYVNLdUZDalVFCkhiUlBINmJ4ZlJ1YkRSdmFqWDByQkpLTDhMRjhiNjdKTEtFR2VxMXBmT1N0VkxVQXZjeElqbHF4WnBUU1loQmwKVnQxcE9MbzRHZGpTclJiYklDeUVDMTdrdUV0QytZV3lFb3E5MmlLNVdMTHdHM2hwVzhyVlVLVzZ2T0cyd0l4bAp0RWhMSGpOOWtnb1VVa2pORG9tK2FlcVVxeXhDeUZEdll4UmdhVTd0Y3pJSk52SUk3aDYxaExQbEZtMmxGQ0xlCjhjeTdKUDMyV1ZDSUpUMHhRNkJJRTdvVld4WWIvMzFVSHYrTHg0UHlBcFpiZ3piMjlvQm54VjhneUxnVjZDWW0Kd1psQlQ4S2hBb0dCQU9tMFZqTkVHVm5EaXNsTDFVVkNKYzFCVU1KcjNwalQvV0g4d2s0UzJYWmhwRWdVQmpQYgpDM3Y5czkxNHh6SjhXYWFtUFZPVGZMRmxzRWFLNnJpMFhjQkhXQi9ob1R1aDVKaDByS1RNWWFMTm9SdU00VCt6Ci9zUG1aY1ZMVXcxdHFmd3U5YlVpSTJCQURQNFM2MUFubk5hSnF1UmFWRk8vT1pqZUkvbHJzMVBSQW9HQkFPem0KVTNvcjNuSDh4WHI2WDNJUjRDM3l3TkZLaHNVVE44VmdWNWRVL0U5RmRHTldUVzRkWHdCK01jeUlQMlFLbjlycwpmcU9Cb0c3NlRKVHF0YzVobjY5Q014c1lVNVdPcDhOZW9oaXplY1luSTFjTk94TmZwdzZDdUZVb1pmTFFxU1dICmJ4dEVEaFkrcXJjR2FLZ3VzMk1uMkJ2cEg1bUhCTk5DL05pSVZ1WTdBb0dBZFlnVEhkOHVuSjBockJCdUpsR1kKN3p2YzRKb2RMV0RYZWpNQ2lQOGp6RXhZc1VNWXgzVnV0aUdtRmtpS2JWSnFSOHdzNVY0MEJJY3VlcHVjWmQyWApsSDZNekNQTjBVNmV4eWxPTmVidlowL2dxUmxWb3BMa0dpTkJwVkkzWjNaeVdYaElhNXJLamJwSWpuSjNVeTFJCnpBQWFLSk5nKzJrZEQwc1FibnlDaURFQ2dZQVFDZVA2OEg5bDdqd2NnRmozNnhmblpIa0RjbTAvYUhhdEtVR2sKNEQ4WXl0WC9aN2RrVGg3QmRNbkFWRFVlZTgyb3o3d2ZLOGFGM1BKVVhyT2lYbCttU1BBVzFJWE1LVlZZVjg3WApwMGNHVUY0SEpjRXJKWjIwME1yVUVTRWQyRnlyU3NrTjZvU2RvdTZCNTdBc09zVXdZR0UwT290R0pLc0I5cFlSCnZ1RkxRd0tCZ1FEZVFuRElPaUQ2SEpmc2loTC8xZ3dnS0hVeVc2WGYrNFhQODd3ZlVXT1N0SEpza29oTkZHdk8KSnpNdUFvc2V2UGFWdElCSXBZbFgycUlaaHlhdyt2VW9BUTZYRnR3WjM1QWo1T1VPbVFQQUJWbkVXZUJZRzdSaQpaZmhEU2NTek5xb3ozWFpjMnA4a2VMWE1XOWJsTDNNOTdOMFRLbExuZ0NrSTdoaXJMVGE2T0E9PQotLS0tLUVORCBSU0EgUFJJVkFURSBLRVktLS0tLQo="""

    # Load the config
    # kubeconfig = yaml.safe_load(kubeconfig)
    # config.load_kube_config_from_dict(kubeconfig)
    # Test fetching node info
    # try:
    #     node_info = get_node_info(kubeconfig)
    #     print(json.dumps(node_info, indent=4, ensure_ascii=False))
    # except Exception as e:
    #     print(f"Error: {e}")
    try:
        pod_info = get_pod_info(kubeconfig)
        print(json.dumps(pod_info, indent=4, ensure_ascii=False))
    except Exception as e:
        print(f"Error: {e}")
54
app/unit_test/local_test.py
Normal file
@ -0,0 +1,54 @@
import re

def parse_resource_value(value_str, resource_type, unit):
    """
    Parse a resource value and convert it to a readable form; value_str is assumed
    to contain digits only (the unit has already been stripped).

    :param value_str: digits-only string (unit already stripped)
    :param resource_type: 'cpu' or 'memory'
    :param unit: the original unit, which decides how the number is handled
                 (e.g. 'm', 'n', 'Ki', 'Mi', 'Gi')
    :return: the converted readable value and its target unit
    """
    print(111)
    # Convert the input string straight to a float (no unit extraction here)
    try:
        value = float(value_str.strip())
    except ValueError:
        raise ValueError("cannot parse the input string as a number")

    print(222)
    if resource_type == 'cpu':
        # CPU conversion; the unit parameter tells us the original unit
        if unit == 'n':  # nanocores
            return value / 1e9, 'cores'
        elif unit == 'm':  # millicores
            return value / 1000, 'cores'
        else:
            # Assume plain cores (the unit is "core" or a bare number)
            return value, 'cores'

    elif resource_type == 'memory':
        # Memory unit conversion
        units_dict = {'Ki': 1, 'Mi': 1024, 'Gi': 1024 * 1024}
        if unit in units_dict:
            bytes_val = value * 1024 * units_dict[unit]  # Ki/Mi/Gi -> bytes
        elif unit == 'B' or unit == '':  # bytes or no unit
            bytes_val = value
        else:
            raise ValueError(f"unsupported memory unit: {unit}")

        print(444)
        # Convert bytes to MB or GB
        if bytes_val < 1024 * 1024 * 1024:
            return bytes_val / (1024 * 1024), 'MB'
        else:
            return bytes_val / (1024 * 1024 * 1024), 'GB'

    else:
        raise ValueError("unknown resource type; expected 'cpu' or 'memory'")


if __name__ == "__main__":
    numeric_part = re.sub(r'\D', '', '80739445n')
    numeric_part2 = re.sub(r'\D', '', '4792336Ki')
    print(f'CPU: {parse_resource_value(numeric_part, "cpu", unit="n")}')  # CPU usage
    print(f'Memory: {parse_resource_value(numeric_part2, "memory", unit="Ki")}')  # memory usage
261
app/unit_test/master_test.py
Normal file
@ -0,0 +1,261 @@
def ssh_execute_commands(host, port, username, password, commands, real_time_log=False):
    try:
        import paramiko
        # Create the SSH object
        ssh = paramiko.SSHClient()
        # Allow connecting to hosts that are not in known_hosts
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        # Connect to the server
        ssh.connect(hostname=host, port=port, username=username, password=password)
        all_results = []
        result = ""
        error = ""
        for command in commands:
            # stdin, stdout, stderr = ssh.exec_command(f'sudo -S {command}', get_pty=True)
            # stdin.write(password + '\n')
            stdin, stdout, stderr = ssh.exec_command(f'{command}', get_pty=True)
            stdin.flush()
            if real_time_log:
                print(f"Start running command: {command}")
                # Read stdout in real time
                for line in iter(stdout.readline, ""):
                    print(line, end="")
                    result += line
                # Read stderr in real time
                for line in iter(stderr.readline, ""):
                    print(line, end="")
                    error += line
            else:
                result = stdout.read().decode()
                error = stderr.read().decode()

            all_results.append((result, error))
            if real_time_log:
                print(f"Command {command} finished")

        # Close the connection
        ssh.close()
        return all_results
    except Exception as e:
        print(f"Error while connecting over SSH or running commands: {e}")
        return None


def new_cluster(params):
    # The main remote-k8s control logic will be filled in later
    """
    Receives k8s installation parameters passed from the cpcc side and installs
    cluster nodes on intranet machines through remote ssh calls.
    Example parameters:
    {'cluster_type': '0', 'host': '192.168.0.3', 'port': '22', 'user': 'ysh', 'password': 'Kyy@123456'}
    """
    host = params.get("host")
    port = int(params.get("port"))
    username = params.get("user")
    password = params.get("password")
    commands = ['kubectl get nodes', 'kubectl get pods --all-namespaces', 'kubectl get services --all-namespaces']
    results = ssh_execute_commands(host, port, username, password, commands, real_time_log=True)
    if results:
        # print("Overall results of all commands:")
        for result, error in results:
            if result:
                print("Output:")
                print(result)
            if error:
                print("Errors:")
                print(error)
    return results


import json
import argparse
import logging
from kubernetes import client, config
from kubernetes.client.rest import ApiException
import time


def setup_logging():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(levelname)s - %(message)s')


def format_runtime(seconds):
    if seconds < 60:
        return f"{int(seconds)}s"
    elif seconds < 3600:
        minutes = int(seconds // 60)
        return f"{minutes}m"
    elif seconds < 86400:
        hours = int(seconds // 3600)
        return f"{hours}h"
    else:
        days = int(seconds // 86400)
        return f"{days}d"
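# Illustrative examples: format_runtime(42) -> "42s"; format_runtime(18000) -> "5h"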


def get_node_info():
    try:
        config.load_kube_config()
        v1 = client.CoreV1Api()
        api_client = client.ApiClient()
        node_metrics_path = "/apis/metrics.k8s.io/v1beta1/nodes"

        nodes = v1.list_node().items
        node_metrics_response = api_client.call_api(
            node_metrics_path, 'GET', auth_settings=['BearerToken'], response_type='object')[0]
        node_metrics = {node['metadata']['name']: node.get('usage', {})
                        for node in node_metrics_response.get('items', [])}

        rows = []
        for node in nodes:
            node_name = node.metadata.name
            internal_ip = next((address.address for address in node.status.addresses
                                if address.type == "InternalIP"), "Unknown")
            external_ip = next((address.address for address in node.status.addresses
                                if address.type == "ExternalIP"), "Unknown")
            status = node.status.conditions[-1].status if node.status.conditions else "Unknown"
            roles = []
            role_labels = [
                "node-role.kubernetes.io/control-plane",
                "node-role.kubernetes.io/master",
                "node-role.kubernetes.io/worker"
            ]
            for label in role_labels:
                if label in node.metadata.labels:
                    roles.append(label.split("/")[-1])
            roles_str = ",".join(roles) if roles else "None"
            running_time = time.time() - node.metadata.creation_timestamp.timestamp()
            node_age = format_runtime(running_time)
            k8s_version = node.status.node_info.kubelet_version
            os_image = node.status.node_info.os_image
            kernel_version = node.status.node_info.kernel_version
            container_runtime = node.status.node_info.container_runtime_version
            labels = node.metadata.labels

            cpu_usage = node_metrics.get(node_name, {}).get('cpu', 'undefined')
            memory_usage = node_metrics.get(node_name, {}).get('memory', 'undefined')

            node_info = {
                "node_name": node_name,
                "node_status": status,
                "node_role": roles_str,
                "node_age": node_age,
                "node_version": k8s_version,
                "node_internalip": internal_ip,
                "node_externalip": external_ip,
                "node_osversion": os_image,
                "node_kernelversion": kernel_version,
                "node_containeruntime": container_runtime,
                "node_labels": labels,
                "node_cpurate": cpu_usage,
                "node_memrate": memory_usage
            }
            rows.append(node_info)

        result = {
            "total": len(rows),
            "rows": rows
        }
        return result
    except ApiException as e:
        logging.error(f"Error while getting node info: {e}")
        return {"total": 0, "rows": []}


def get_pod_info():
    try:
        config.load_kube_config()
        v1 = client.CoreV1Api()
        api_client = client.ApiClient()

        namespaces = v1.list_namespace().items
        non_system_namespaces = [ns.metadata.name for ns in namespaces if
                                 not ns.metadata.name.startswith(('kube-', 'default', 'local'))]

        rows = []
        for namespace in non_system_namespaces:
            pods = v1.list_namespaced_pod(namespace).items
            pod_metrics_path = f"/apis/metrics.k8s.io/v1beta1/namespaces/{namespace}/pods"
            pod_metrics_response = api_client.call_api(
                pod_metrics_path, 'GET', auth_settings=['BearerToken'], response_type='object')[0]
            pod_metrics = {pod['metadata']['name']: pod.get('usage', {})
                           for pod in pod_metrics_response.get('items', [])}

            for pod in pods:
                pod_name = pod.metadata.name
                if pod.status.container_statuses:
                    ready_count = sum(1 for cs in pod.status.container_statuses if cs.ready)
                else:
                    ready_count = 0
                ready_status = f"{ready_count}/{len(pod.spec.containers)}"
                readiness_conditions = [{"type": cond.type, "status": cond.status}
                                        for cond in pod.status.conditions if cond.type == "Ready"]
                phase = pod.status.phase
                restart_count = sum(cs.restart_count for cs in pod.status.container_statuses) if pod.status.container_statuses else 0
                running_time = time.time() - pod.metadata.creation_timestamp.timestamp()
                pod_age = format_runtime(running_time)
                pod_ip = pod.status.pod_ip if pod.status.pod_ip else "Unknown"
                node_name = pod.spec.node_name if pod.spec.node_name else "Pod not scheduled to a node"
                nominated_node = pod.status.nominated_node_name if pod.status.nominated_node_name else "no node nominated by the scheduler"

                if phase == "Pending":
                    pod_ip = "Pod is Pending, no IP assigned"
                    node_name = "Pod is Pending, not scheduled to a node"
                    nominated_node = "Pod is Pending, no node nominated by the scheduler"

                readiness_gates = []

                cpu_usage = pod_metrics.get(pod_name, {}).get('cpu', 'undefined')
                memory_usage = pod_metrics.get(pod_name, {}).get('memory', 'undefined')

                if phase in ["Pending", "Succeeded", "Failed"]:
                    cpu_usage = "Pod not running, no usage data"
                    memory_usage = "Pod not running, no usage data"

                pod_info = {
                    "pod_namespace": namespace,
                    "pod_name": pod_name,
                    "pod_ready": ready_status,
                    "pod_running": phase,
                    "pod_restart": restart_count,
                    "pod_age": pod_age,
                    "pod_ip": pod_ip,
                    "pod_node": node_name,
                    "pod_nominated_node": nominated_node,
                    "pod_readiness_gates": readiness_gates,
                    "pod_cpurate": cpu_usage,
                    "pod_memrate": memory_usage
                }
                rows.append(pod_info)

        result = {
            "total": len(rows),
            "rows": rows
        }
        return result
    except ApiException as e:
        logging.error(f"Error while getting Pod info: {e}")
        return {"total": 0, "rows": []}


if __name__ == "__main__":
    # params = {'cluster_type': '0', 'host': '192.168.0.3', 'port': '22', 'user': 'root', 'password': 'Yuanshenhong.1'}
    # new_cluster(params)
    parser = argparse.ArgumentParser(description='Fetch live Kubernetes node and Pod information')
    parser.add_argument('--interval', type=int, default=300, help='refresh interval (seconds)')
    args = parser.parse_args()

    setup_logging()
    while True:
        node_info = get_node_info()
        pod_info = get_pod_info()

        result = {
            "node_info": node_info,
            "pod_info": pod_info
        }

        logging.info(json.dumps(result, indent=4, ensure_ascii=False))

        time.sleep(args.interval)
79
app/unit_test/noroot_install.py
Normal file
@ -0,0 +1,79 @@
def ssh_execute_command(host, port, username, password, commands, real_time_log=False,
                        remote_exec=True, scp_map=dict()):
    try:
        import os
        import paramiko
        # Create the SSH object
        ssh = paramiko.SSHClient()
        # Allow connecting to hosts that are not in known_hosts
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        # Connect to the server
        ssh.connect(hostname=host, port=port, username=username, password=password)
        all_results = []
        if scp_map:
            sftp = ssh.open_sftp()
            for sf, df in scp_map.items():
                # 1. Upload into /tmp/ first
                tmp_path = f"/tmp/{os.path.basename(df)}"
                print(f"Uploading {sf} to the temporary path {tmp_path}")
                sftp.put(sf, tmp_path)

                # 2. Move it to the target directory with sudo
                # (-S makes sudo read the password from stdin)
                cmd = f"echo {password} | sudo -S mv {tmp_path} {df}"
                print(f"Moving {tmp_path} to {df} with sudo")
                stdin, stdout, stderr = ssh.exec_command(cmd)
                exit_status = stdout.channel.recv_exit_status()
                if exit_status != 0:
                    print(f"Move failed: {stderr.read().decode()}")
                else:
                    print("Move succeeded")
            sftp.close()
        if remote_exec:
            # Common flow
            result = ""
            error = ""
            for command in commands:
                stdin, stdout, stderr = ssh.exec_command(f'{command}', get_pty=True)
                stdin.flush()
                if real_time_log:
                    print(f"Start running command: {command=}, please be patient...")
                    # Read stdout in real time
                    for line in iter(stdout.readline, ""):
                        print(f'{line=}')
                        result += line
                    # Read stderr in real time
                    for line in iter(stderr.readline, ""):
                        print(f'{line=}')
                        error += line
                else:
                    result = stdout.read().decode()
                    error = stderr.read().decode()

                all_results.append((result, error))
                if real_time_log:
                    print(f"Command {command=} finished")
        # Close the connection
        ssh.close()
        return all_results
    except Exception as e:
        print(f"Error while connecting over SSH or running commands: {e=}")
        return [("", str(e))]  # keep the (result, error) shape so callers can unpack


if __name__ == "__main__":
    # Test code
    host = ""
    port = 22
    username = ""
    password = ""
    commands = ["sudo apt-get update"]
    scp_map = {
        "local_file.txt": "/remote/path/remote_file.txt"
    }
    results = ssh_execute_command(host, port, username, password, commands, real_time_log=True, scp_map=scp_map)
    for result, error in results:
        print(f"Result: {result}")
        print(f"Error: {error}")
# This code is a simplified version of the SSH command execution utility.
# It uses the paramiko library to connect to a remote server and execute commands.
# The code includes functionality for uploading files via SFTP and executing commands with real-time logging.
70
app/unit_test/worker_test.py
Normal file
@ -0,0 +1,70 @@
def ssh_execute_commands(host, port, username, password, commands, real_time_log=False):
    try:
        import paramiko
        # Create the SSH object
        ssh = paramiko.SSHClient()
        # Allow connecting to hosts that are not in known_hosts
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        # Connect to the server
        ssh.connect(hostname=host, port=port, username=username, password=password)
        all_results = []
        result = ""
        error = ""
        for command in commands:
            # stdin, stdout, stderr = ssh.exec_command(f'sudo -S {command}', get_pty=True)
            # stdin.write(password + '\n')
            stdin, stdout, stderr = ssh.exec_command(f'{command}', get_pty=True)
            stdin.flush()
            if real_time_log:
                print(f"Start running command: {command}")
                # Read stdout in real time
                for line in iter(stdout.readline, ""):
                    print(line, end="")
                    result += line
                # Read stderr in real time
                for line in iter(stderr.readline, ""):
                    print(line, end="")
                    error += line
            else:
                result = stdout.read().decode()
                error = stderr.read().decode()

            all_results.append((result, error))
            if real_time_log:
                print(f"Command {command} finished")

        # Close the connection
        ssh.close()
        return all_results
    except Exception as e:
        print(f"Error while connecting over SSH or running commands: {e}")
        return None


def new_cluster(params):
    # The main remote-k8s control logic will be filled in later
    """
    Receives k8s installation parameters passed from the cpcc side and installs
    cluster nodes on intranet machines through remote ssh calls.
    Example parameters:
    {'cluster_type': '0', 'host': '192.168.0.3', 'port': '22', 'user': 'ysh', 'password': 'Kyy@123456'}
    """
    host = params.get("host")
    port = int(params.get("port"))
    username = params.get("user")
    password = params.get("password")
    commands = ['cd /install/ && ./k8s_install_1804.sh master', 'cd /install/ && cat join_command.txt']
    results = ssh_execute_commands(host, port, username, password, commands, real_time_log=True)
    if results:
        print("Overall results of all commands:")
        for result, error in results:
            if result:
                print("Output:")
                print(result)
            if error:
                print("Errors:")
                print(error)
    return results


if __name__ == "__main__":
    params = {'cluster_type': '0', 'host': '192.168.0.2', 'port': '22', 'user': 'root', 'password': 'Yuanshenhong.1'}
    new_cluster(params)
55
conf/config.json
Normal file
@ -0,0 +1,55 @@
|
||||
{
    "logger":{
        "name":"pcapi",
        "levelname":"info",
        "logfile":"$[workdir]$/logs/pcapi.log"
    },
    "authentication":{
        "user":"root",
        "password":"Kyy@123456",
        "iplist":[
            "47.93.12.75",
            "127.0.0.1",
            "117.50.205.57",
            "10.60.179.61",
            "114.246.236.28",
            "115.190.98.166",
            "61.48.132.253",
            "114.246.239.237",
            "223.72.41.93",
            "111.201.209.76"
        ]
    },
    "filesroot":"$[workdir]$/files",
    "website":{
        "paths":[
            ["$[workdir]$/wwwroot",""]
        ],
        "client_max_size":10000,
        "host":"0.0.0.0",
        "port":9001,
        "coding":"utf-8",
        "indexes":[
            "index.dspy",
            "index.md"
        ],
        "startswiths":[
            {
                "leading":"/idfile",
                "registerfunction":"idFileDownload"
            }
        ],
        "processors":[
            [".dspy","dspy"],
            [".md","md"]
        ],
        "session_max_time":3000,
        "session_issue_time":2500
    },
    "langMapping":{
        "zh-Hans-CN":"zh-cn",
        "zh-CN":"zh-cn",
        "en-us":"en",
        "en-US":"en"
    }
}
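
The `authentication` block above drives a BasicAuth-plus-allowlist check. For reference, a minimal sketch of the equivalent logic, assuming the config has been loaded into a dict; the `is_authorized` helper and `peer_ip` parameter are illustrative, not the server's actual hook:

```python
import base64

def is_authorized(auth_header: str, peer_ip: str, auth_conf: dict) -> bool:
    # Reject requests from IPs outside the allowlist
    if peer_ip not in auth_conf["iplist"]:
        return False
    # Expect "Basic <base64(user:password)>"
    if not auth_header or not auth_header.startswith("Basic "):
        return False
    try:
        user, _, password = base64.b64decode(auth_header[6:]).decode().partition(":")
    except Exception:
        return False
    return user == auth_conf["user"] and password == auth_conf["password"]
```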
0
files/README.md
Normal file
202
files/components.yaml
Normal file
@@ -0,0 +1,202 @@
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    k8s-app: metrics-server
  name: metrics-server
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    k8s-app: metrics-server
    rbac.authorization.k8s.io/aggregate-to-admin: "true"
    rbac.authorization.k8s.io/aggregate-to-edit: "true"
    rbac.authorization.k8s.io/aggregate-to-view: "true"
  name: system:aggregated-metrics-reader
rules:
- apiGroups:
  - metrics.k8s.io
  resources:
  - pods
  - nodes
  verbs:
  - get
  - list
  - watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    k8s-app: metrics-server
  name: system:metrics-server
rules:
- apiGroups:
  - ""
  resources:
  - nodes/metrics
  verbs:
  - get
- apiGroups:
  - ""
  resources:
  - pods
  - nodes
  verbs:
  - get
  - list
  - watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  labels:
    k8s-app: metrics-server
  name: metrics-server-auth-reader
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: extension-apiserver-authentication-reader
subjects:
- kind: ServiceAccount
  name: metrics-server
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    k8s-app: metrics-server
  name: metrics-server:system:auth-delegator
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:auth-delegator
subjects:
- kind: ServiceAccount
  name: metrics-server
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    k8s-app: metrics-server
  name: system:metrics-server
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:metrics-server
subjects:
- kind: ServiceAccount
  name: metrics-server
  namespace: kube-system
---
apiVersion: v1
kind: Service
metadata:
  labels:
    k8s-app: metrics-server
  name: metrics-server
  namespace: kube-system
spec:
  ports:
  - name: https
    port: 443
    protocol: TCP
    targetPort: https
  selector:
    k8s-app: metrics-server
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    k8s-app: metrics-server
  name: metrics-server
  namespace: kube-system
spec:
  selector:
    matchLabels:
      k8s-app: metrics-server
  strategy:
    rollingUpdate:
      maxUnavailable: 0
  template:
    metadata:
      labels:
        k8s-app: metrics-server
    spec:
      containers:
      - args:
        - --cert-dir=/tmp
        - --secure-port=10250
        - --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname
        - --kubelet-use-node-status-port
        - --metric-resolution=15s
        - --kubelet-insecure-tls
        image: registry.aliyuncs.com/google_containers/metrics-server:v0.7.2
        imagePullPolicy: IfNotPresent
        livenessProbe:
          failureThreshold: 3
          httpGet:
            path: /livez
            port: https
            scheme: HTTPS
          periodSeconds: 10
        name: metrics-server
        ports:
        - containerPort: 10250
          name: https
          protocol: TCP
        readinessProbe:
          failureThreshold: 3
          httpGet:
            path: /readyz
            port: https
            scheme: HTTPS
          initialDelaySeconds: 20
          periodSeconds: 10
        resources:
          requests:
            cpu: 100m
            memory: 200Mi
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop:
            - ALL
          readOnlyRootFilesystem: true
          runAsNonRoot: true
          runAsUser: 1000
          seccompProfile:
            type: RuntimeDefault
        volumeMounts:
        - mountPath: /tmp
          name: tmp-dir
      nodeSelector:
        kubernetes.io/os: linux
      priorityClassName: system-cluster-critical
      serviceAccountName: metrics-server
      volumes:
      - emptyDir: {}
        name: tmp-dir
---
apiVersion: apiregistration.k8s.io/v1
kind: APIService
metadata:
  labels:
    k8s-app: metrics-server
  name: v1beta1.metrics.k8s.io
spec:
  group: metrics.k8s.io
  groupPriorityMinimum: 100
  insecureSkipTLSVerify: true
  service:
    name: metrics-server
    namespace: kube-system
  version: v1beta1
  versionPriority: 100
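
Once metrics-server is up, the aggregated `metrics.k8s.io` API registered by the APIService above can be read with the same kubernetes Python client this repo already uses. A minimal sketch, assuming a reachable kubeconfig:

```python
from kubernetes import client, config

config.load_kube_config()
api = client.CustomObjectsApi()
# Query the aggregated API registered by the APIService above
node_metrics = api.list_cluster_custom_object("metrics.k8s.io", "v1beta1", "nodes")
for item in node_metrics["items"]:
    print(item["metadata"]["name"], item["usage"]["cpu"], item["usage"]["memory"])
```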
610
files/ingress-nginx-controller.yaml
Normal file
@@ -0,0 +1,610 @@
apiVersion: v1
kind: Namespace
metadata:
  labels:
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
  name: ingress-nginx
---
apiVersion: v1
automountServiceAccountToken: true
kind: ServiceAccount
metadata:
  labels:
    app.kubernetes.io/component: controller
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/part-of: ingress-nginx
    app.kubernetes.io/version: 1.5.1
  name: ingress-nginx
  namespace: ingress-nginx
---
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    app.kubernetes.io/component: admission-webhook
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/part-of: ingress-nginx
    app.kubernetes.io/version: 1.5.1
  name: ingress-nginx-admission
  namespace: ingress-nginx
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  labels:
    app.kubernetes.io/component: controller
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/part-of: ingress-nginx
    app.kubernetes.io/version: 1.5.1
  name: ingress-nginx
  namespace: ingress-nginx
rules:
- apiGroups:
  - ""
  resources:
  - namespaces
  verbs:
  - get
- apiGroups:
  - ""
  resources:
  - configmaps
  - pods
  - secrets
  - endpoints
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - services
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - networking.k8s.io
  resources:
  - ingresses
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - networking.k8s.io
  resources:
  - ingresses/status
  verbs:
  - update
- apiGroups:
  - networking.k8s.io
  resources:
  - ingressclasses
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resourceNames:
  - ingress-nginx-leader
  resources:
  - configmaps
  verbs:
  - get
  - update
- apiGroups:
  - ""
  resources:
  - configmaps
  verbs:
  - create
- apiGroups:
  - coordination.k8s.io
  resourceNames:
  - ingress-nginx-leader
  resources:
  - leases
  verbs:
  - get
  - update
- apiGroups:
  - coordination.k8s.io
  resources:
  - leases
  verbs:
  - create
- apiGroups:
  - ""
  resources:
  - events
  verbs:
  - create
  - patch
- apiGroups:
  - discovery.k8s.io
  resources:
  - endpointslices
  verbs:
  - list
  - watch
  - get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  labels:
    app.kubernetes.io/component: admission-webhook
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/part-of: ingress-nginx
    app.kubernetes.io/version: 1.5.1
  name: ingress-nginx-admission
  namespace: ingress-nginx
rules:
- apiGroups:
  - ""
  resources:
  - secrets
  verbs:
  - get
  - create
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/part-of: ingress-nginx
    app.kubernetes.io/version: 1.5.1
  name: ingress-nginx
rules:
- apiGroups:
  - ""
  resources:
  - configmaps
  - endpoints
  - nodes
  - pods
  - secrets
  - namespaces
  verbs:
  - list
  - watch
- apiGroups:
  - coordination.k8s.io
  resources:
  - leases
  verbs:
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
- apiGroups:
  - ""
  resources:
  - services
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - networking.k8s.io
  resources:
  - ingresses
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - events
  verbs:
  - create
  - patch
- apiGroups:
  - networking.k8s.io
  resources:
  - ingresses/status
  verbs:
  - update
- apiGroups:
  - networking.k8s.io
  resources:
  - ingressclasses
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - discovery.k8s.io
  resources:
  - endpointslices
  verbs:
  - list
  - watch
  - get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    app.kubernetes.io/component: admission-webhook
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/part-of: ingress-nginx
    app.kubernetes.io/version: 1.5.1
  name: ingress-nginx-admission
rules:
- apiGroups:
  - admissionregistration.k8s.io
  resources:
  - validatingwebhookconfigurations
  verbs:
  - get
  - update
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  labels:
    app.kubernetes.io/component: controller
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/part-of: ingress-nginx
    app.kubernetes.io/version: 1.5.1
  name: ingress-nginx
  namespace: ingress-nginx
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: ingress-nginx
subjects:
- kind: ServiceAccount
  name: ingress-nginx
  namespace: ingress-nginx
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  labels:
    app.kubernetes.io/component: admission-webhook
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/part-of: ingress-nginx
    app.kubernetes.io/version: 1.5.1
  name: ingress-nginx-admission
  namespace: ingress-nginx
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: ingress-nginx-admission
subjects:
- kind: ServiceAccount
  name: ingress-nginx-admission
  namespace: ingress-nginx
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/part-of: ingress-nginx
    app.kubernetes.io/version: 1.5.1
  name: ingress-nginx
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: ingress-nginx
subjects:
- kind: ServiceAccount
  name: ingress-nginx
  namespace: ingress-nginx
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    app.kubernetes.io/component: admission-webhook
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/part-of: ingress-nginx
    app.kubernetes.io/version: 1.5.1
  name: ingress-nginx-admission
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: ingress-nginx-admission
subjects:
- kind: ServiceAccount
  name: ingress-nginx-admission
  namespace: ingress-nginx
---
apiVersion: v1
data:
  allow-snippet-annotations: "true"
kind: ConfigMap
metadata:
  labels:
    app.kubernetes.io/component: controller
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/part-of: ingress-nginx
    app.kubernetes.io/version: 1.5.1
  name: ingress-nginx-controller
  namespace: ingress-nginx
---
apiVersion: v1
kind: Service
metadata:
  labels:
    app.kubernetes.io/component: controller
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/part-of: ingress-nginx
    app.kubernetes.io/version: 1.5.1
  name: ingress-nginx-controller
  namespace: ingress-nginx
spec:
  externalTrafficPolicy: Local
  ipFamilies:
  - IPv4
  ipFamilyPolicy: SingleStack
  ports:
  - appProtocol: http
    name: http
    port: 80
    protocol: TCP
    targetPort: http
  - appProtocol: https
    name: https
    port: 443
    protocol: TCP
    targetPort: https
  selector:
    app.kubernetes.io/component: controller
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
  type: LoadBalancer
---
apiVersion: v1
kind: Service
metadata:
  labels:
    app.kubernetes.io/component: controller
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/part-of: ingress-nginx
    app.kubernetes.io/version: 1.5.1
  name: ingress-nginx-controller-admission
  namespace: ingress-nginx
spec:
  ports:
  - appProtocol: https
    name: https-webhook
    port: 443
    targetPort: webhook
  selector:
    app.kubernetes.io/component: controller
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
  type: ClusterIP
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app.kubernetes.io/component: controller
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/part-of: ingress-nginx
    app.kubernetes.io/version: 1.5.1
  name: ingress-nginx-controller
  namespace: ingress-nginx
spec:
  minReadySeconds: 0
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app.kubernetes.io/component: controller
      app.kubernetes.io/instance: ingress-nginx
      app.kubernetes.io/name: ingress-nginx
  template:
    metadata:
      labels:
        app.kubernetes.io/component: controller
        app.kubernetes.io/instance: ingress-nginx
        app.kubernetes.io/name: ingress-nginx
    spec:
      containers:
      - args:
        - /nginx-ingress-controller
        - --publish-service=$(POD_NAMESPACE)/ingress-nginx-controller
        - --election-id=ingress-nginx-leader
        - --controller-class=k8s.io/ingress-nginx
        - --ingress-class=nginx
        - --configmap=$(POD_NAMESPACE)/ingress-nginx-controller
        - --validating-webhook=:8443
        - --validating-webhook-certificate=/usr/local/certificates/cert
        - --validating-webhook-key=/usr/local/certificates/key
        env:
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        image: registry.cn-hangzhou.aliyuncs.com/google_containers/nginx-ingress-controller:v1.5.1
        imagePullPolicy: IfNotPresent
        lifecycle:
          preStop:
            exec:
              command:
              - /wait-shutdown
        livenessProbe:
          failureThreshold: 5
          httpGet:
            path: /healthz
            port: 10254
            scheme: HTTP
          initialDelaySeconds: 10
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 1
        name: controller
        ports:
        - containerPort: 80
          name: http
          protocol: TCP
        - containerPort: 443
          name: https
          protocol: TCP
        - containerPort: 8443
          name: webhook
          protocol: TCP
        readinessProbe:
          failureThreshold: 3
          httpGet:
            path: /healthz
            port: 10254
            scheme: HTTP
          initialDelaySeconds: 10
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 1
        resources:
          requests:
            cpu: 100m
            memory: 90Mi
        securityContext:
          allowPrivilegeEscalation: true
          capabilities:
            add:
            - NET_BIND_SERVICE
            drop:
            - ALL
          runAsUser: 101
        volumeMounts:
        - mountPath: /usr/local/certificates/
          name: webhook-cert
          readOnly: true
      dnsPolicy: ClusterFirst
      nodeSelector:
        kubernetes.io/os: linux
      serviceAccountName: ingress-nginx
      terminationGracePeriodSeconds: 300
      volumes:
      - name: webhook-cert
        secret:
          secretName: ingress-nginx-admission
---
apiVersion: batch/v1
kind: Job
metadata:
  labels:
    app.kubernetes.io/component: admission-webhook
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/part-of: ingress-nginx
    app.kubernetes.io/version: 1.5.1
  name: ingress-nginx-admission-create
  namespace: ingress-nginx
spec:
  template:
    metadata:
      labels:
        app.kubernetes.io/component: admission-webhook
        app.kubernetes.io/instance: ingress-nginx
        app.kubernetes.io/name: ingress-nginx
        app.kubernetes.io/part-of: ingress-nginx
        app.kubernetes.io/version: 1.5.1
      name: ingress-nginx-admission-create
    spec:
      containers:
      - args:
        - create
        - --host=ingress-nginx-controller-admission,ingress-nginx-controller-admission.$(POD_NAMESPACE).svc
        - --namespace=$(POD_NAMESPACE)
        - --secret-name=ingress-nginx-admission
        env:
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        image: registry.cn-hangzhou.aliyuncs.com/google_containers/kube-webhook-certgen:v1.1.1
        imagePullPolicy: IfNotPresent
        name: create
        securityContext:
          allowPrivilegeEscalation: false
      nodeSelector:
        kubernetes.io/os: linux
      restartPolicy: OnFailure
      securityContext:
        fsGroup: 2000
        runAsNonRoot: true
        runAsUser: 2000
      serviceAccountName: ingress-nginx-admission
---
apiVersion: batch/v1
kind: Job
metadata:
  labels:
    app.kubernetes.io/component: admission-webhook
    app.kubernetes.io/instance: ingress-nginx
    app.kubernetes.io/name: ingress-nginx
    app.kubernetes.io/part-of: ingress-nginx
    app.kubernetes.io/version: 1.5.1
  name: ingress-nginx-admission-patch
  namespace: ingress-nginx
spec:
  template:
    metadata:
      labels:
        app.kubernetes.io/component: admission-webhook
        app.kubernetes.io/instance: ingress-nginx
        app.kubernetes.io/name: ingress-nginx
        app.kubernetes.io/part-of: ingress-nginx
        app.kubernetes.io/version: 1.5.1
      name: ingress-nginx-admission-patch
    spec:
      containers:
      - args:
        - patch
        - --webhook-name=ingress-nginx-admission
        - --namespace=$(POD_NAMESPACE)
        - --patch-mutating=false
        - --secret-name=ingress-nginx-admission
        - --patch-failure-policy=Fail
        env:
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        image: registry.cn-hangzhou.aliyuncs.com/google_containers/kube-webhook-certgen:v1.1.1
        imagePullPolicy: IfNotPresent
        name: patch
        securityContext:
          allowPrivilegeEscalation: false
      nodeSelector:
        kubernetes.io/os: linux
      restartPolicy: OnFailure
      securityContext:
        fsGroup: 2000
        runAsNonRoot: true
        runAsUser: 2000
      serviceAccountName: ingress-nginx-admission
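
With the controller deployed, workloads can be exposed through `ingressClassName: nginx` (matching the `--ingress-class` argument above). A minimal sketch using the kubernetes Python client; the service name, host, and namespace are placeholders:

```python
from kubernetes import client, config

config.load_kube_config()
ingress = client.V1Ingress(
    metadata=client.V1ObjectMeta(name="demo-ingress"),
    spec=client.V1IngressSpec(
        ingress_class_name="nginx",  # matches --ingress-class in the controller args
        rules=[client.V1IngressRule(
            host="demo.example.com",  # placeholder hostname
            http=client.V1HTTPIngressRuleValue(paths=[client.V1HTTPIngressPath(
                path="/", path_type="Prefix",
                backend=client.V1IngressBackend(service=client.V1IngressServiceBackend(
                    name="demo-service", port=client.V1ServiceBackendPort(number=80))),
            )]),
        )],
    ),
)
client.NetworkingV1Api().create_namespaced_ingress(namespace="default", body=ingress)
```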
209
files/kube-flannel.yml
Normal file
@@ -0,0 +1,209 @@
---
kind: Namespace
apiVersion: v1
metadata:
  name: kube-flannel
  labels:
    k8s-app: flannel
    pod-security.kubernetes.io/enforce: privileged
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  labels:
    k8s-app: flannel
  name: flannel
rules:
- apiGroups:
  - ""
  resources:
  - pods
  verbs:
  - get
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - ""
  resources:
  - nodes/status
  verbs:
  - patch
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  labels:
    k8s-app: flannel
  name: flannel
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: flannel
subjects:
- kind: ServiceAccount
  name: flannel
  namespace: kube-flannel
---
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    k8s-app: flannel
  name: flannel
  namespace: kube-flannel
---
kind: ConfigMap
apiVersion: v1
metadata:
  name: kube-flannel-cfg
  namespace: kube-flannel
  labels:
    tier: node
    k8s-app: flannel
    app: flannel
data:
  cni-conf.json: |
    {
      "name": "cbr0",
      "cniVersion": "0.3.1",
      "plugins": [
        {
          "type": "flannel",
          "delegate": {
            "hairpinMode": true,
            "isDefaultGateway": true
          }
        },
        {
          "type": "portmap",
          "capabilities": {
            "portMappings": true
          }
        }
      ]
    }
  net-conf.json: |
    {
      "Network": "10.244.0.0/16",
      "EnableNFTables": false,
      "Backend": {
        "Type": "vxlan"
      }
    }
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: kube-flannel-ds
  namespace: kube-flannel
  labels:
    tier: node
    app: flannel
    k8s-app: flannel
spec:
  selector:
    matchLabels:
      app: flannel
  template:
    metadata:
      labels:
        tier: node
        app: flannel
    spec:
      affinity:
        nodeAffinity:
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: kubernetes.io/os
                operator: In
                values:
                - linux
      hostNetwork: true
      priorityClassName: system-node-critical
      tolerations:
      - operator: Exists
        effect: NoSchedule
      serviceAccountName: flannel
      initContainers:
      - name: install-cni-plugin
        image: ghcr.io/flannel-io/flannel-cni-plugin:v1.6.2-flannel1
        command:
        - cp
        args:
        - -f
        - /flannel
        - /opt/cni/bin/flannel
        volumeMounts:
        - name: cni-plugin
          mountPath: /opt/cni/bin
      - name: install-cni
        image: ghcr.io/flannel-io/flannel:v0.26.4
        command:
        - cp
        args:
        - -f
        - /etc/kube-flannel/cni-conf.json
        - /etc/cni/net.d/10-flannel.conflist
        volumeMounts:
        - name: cni
          mountPath: /etc/cni/net.d
        - name: flannel-cfg
          mountPath: /etc/kube-flannel/
      containers:
      - name: kube-flannel
        image: ghcr.io/flannel-io/flannel:v0.26.4
        command:
        - /opt/bin/flanneld
        args:
        - --ip-masq
        - --kube-subnet-mgr
        resources:
          requests:
            cpu: "100m"
            memory: "50Mi"
        securityContext:
          privileged: false
          capabilities:
            add: ["NET_ADMIN", "NET_RAW"]
        env:
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        - name: EVENT_QUEUE_DEPTH
          value: "5000"
        volumeMounts:
        - name: run
          mountPath: /run/flannel
        - name: flannel-cfg
          mountPath: /etc/kube-flannel/
        - name: xtables-lock
          mountPath: /run/xtables.lock
      volumes:
      - name: run
        hostPath:
          path: /run/flannel
      - name: cni-plugin
        hostPath:
          path: /opt/cni/bin
      - name: cni
        hostPath:
          path: /etc/cni/net.d
      - name: flannel-cfg
        configMap:
          name: kube-flannel-cfg
      - name: xtables-lock
        hostPath:
          path: /run/xtables.lock
          type: FileOrCreate
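
A quick way to confirm the flannel DaemonSet has rolled out on every node, using the same Python client (a minimal sketch, assuming cluster access):

```python
from kubernetes import client, config

config.load_kube_config()
# Read the DaemonSet status defined in the manifest above
ds = client.AppsV1Api().read_namespaced_daemon_set("kube-flannel-ds", "kube-flannel")
print("desired:", ds.status.desired_number_scheduled, "ready:", ds.status.number_ready)
```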
39
files/nfs-provisioner-deploy.yaml
Normal file
@@ -0,0 +1,39 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nfs-client-provisioner
  labels:
    app: nfs-client-provisioner
spec:
  replicas: 1
  strategy:
    type: Recreate ## Upgrade strategy: delete then recreate (default is rolling update)
  selector:
    matchLabels:
      app: nfs-client-provisioner
  template:
    metadata:
      labels:
        app: nfs-client-provisioner
    spec:
      serviceAccountName: nfs-client-provisioner
      containers:
      - name: nfs-client-provisioner
        #image: gcr.io/k8s-staging-sig-storage/nfs-subdir-external-provisioner:v4.0.0
        image: registry.cn-beijing.aliyuncs.com/xngczl/nfs-subdir-external-provisione:v4.0.0
        volumeMounts:
        - name: nfs-client-root
          mountPath: /persistentvolumes
        env:
        - name: PROVISIONER_NAME ## Provisioner name; the StorageClass defined later must use the same value
          value: k8s-sigs.io/nfs-subdir-external-provisioner
        - name: NFS_SERVER ## NFS server address; must match the volumes section below
          value: 192.168.0.3
        - name: NFS_PATH ## NFS server data directory; must match the volumes section below
          value: /d/k8s_nss
      volumes:
      - name: nfs-client-root
        nfs:
          server: 192.168.0.3 ## NFS server address
          path: /d/k8s_nss ## NFS server data directory
          readOnly: false
60
files/nfs-rbac.yaml
Normal file
@@ -0,0 +1,60 @@
apiVersion: v1
kind: ServiceAccount
metadata:
  name: nfs-client-provisioner
  namespace: default # Replace with the Namespace you deploy into
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: nfs-client-provisioner-runner
rules:
- apiGroups: [""]
  resources: ["persistentvolumes"]
  verbs: ["get", "list", "watch", "create", "delete"]
- apiGroups: [""]
  resources: ["persistentvolumeclaims"]
  verbs: ["get", "list", "watch", "update"]
- apiGroups: ["storage.k8s.io"]
  resources: ["storageclasses"]
  verbs: ["get", "list", "watch"]
- apiGroups: [""]
  resources: ["events"]
  verbs: ["create", "update", "patch"]
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: run-nfs-client-provisioner
subjects:
- kind: ServiceAccount
  name: nfs-client-provisioner
  namespace: default
roleRef:
  kind: ClusterRole
  name: nfs-client-provisioner-runner
  apiGroup: rbac.authorization.k8s.io
---
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: leader-locking-nfs-client-provisioner
  namespace: default
rules:
- apiGroups: [""]
  resources: ["endpoints"]
  verbs: ["get", "list", "watch", "create", "update", "patch"]
---
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: leader-locking-nfs-client-provisioner
  namespace: default
subjects:
- kind: ServiceAccount
  name: nfs-client-provisioner
  namespace: default
roleRef:
  kind: Role
  name: leader-locking-nfs-client-provisioner
  apiGroup: rbac.authorization.k8s.io
13
files/storage_class.yaml
Normal file
@@ -0,0 +1,13 @@
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: nfs-storage-class
  annotations:
    storageclass.kubernetes.io/is-default-class: "false"
allowVolumeExpansion: true
provisioner: k8s-sigs.io/nfs-subdir-external-provisioner
reclaimPolicy: Delete
volumeBindingMode: Immediate
parameters:
  pathPattern: "${.PVC.namespace}/${.PVC.name}"
  onDelete: delete
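
A PVC that requests storage from this class is provisioned onto the NFS share configured above, laid out per `pathPattern`. A minimal sketch via the Python client; the PVC name, size, and namespace are placeholders:

```python
from kubernetes import client, config

config.load_kube_config()
pvc = client.V1PersistentVolumeClaim(
    metadata=client.V1ObjectMeta(name="demo-pvc"),
    spec=client.V1PersistentVolumeClaimSpec(
        access_modes=["ReadWriteMany"],  # NFS supports shared read-write
        storage_class_name="nfs-storage-class",
        resources=client.V1ResourceRequirements(requests={"storage": "1Gi"}),
    ),
)
client.CoreV1Api().create_namespaced_persistent_volume_claim(namespace="default", body=pvc)
```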
5
install.sh
Normal file
@@ -0,0 +1,5 @@
#!/bin/bash
pip3 -V
pip3 list
pip3 install kubernetes packaging ldap3 paramiko python-dateutil aiohttp-socks asyncssh nanoid redis -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3 install --upgrade cryptography pyOpenSSL
0
logs/README.md
Normal file
3
requirements.txt
Normal file
@@ -0,0 +1,3 @@
git+https://git.kaiyuancloud.cn/yumoqing/apppublic
git+https://git.kaiyuancloud.cn/yumoqing/sqlor
git+https://git.kaiyuancloud.cn/yumoqing/ahserver
0
script/README.md
Normal file
23
script/delete_allimage.sh
Normal file
@@ -0,0 +1,23 @@
#!/bin/bash

# Stop and remove all containers and pods
echo "Stopping all containers..."
crictl stop $(crictl ps -q) || true
echo "Removing all containers..."
crictl rm $(crictl ps -a -q) || true
echo "Stopping all pods..."
crictl stopp $(crictl pods -q) || true
echo "Removing all pods..."
crictl rmp $(crictl pods -q) || true

# Delete all images (via crictl)
echo "Deleting all images via crictl..."
crictl images --quiet | xargs -r crictl rmi || true

# Delete all images (via ctr)
echo "Deleting all images via ctr in k8s.io namespace..."
ctr -n=k8s.io images list --quiet | xargs -r ctr -n=k8s.io image rm || true
echo "Deleting all images via ctr in default namespace..."
ctr -n=default images list --quiet | xargs -r ctr -n=default image rm || true

echo "All images and containers have been deleted."
53
script/export_images.sh
Normal file
@@ -0,0 +1,53 @@
#!/bin/bash

# Kubernetes version and image registry settings
K8S_VERSION="v1.28.2"
ALIYUN_REGISTRY="registry.aliyuncs.com/google_containers" # Aliyun Kubernetes image mirror
FLANNEL_REPO="ghcr.io/flannel-io" # Flannel image repository
NETWORK_PLUGIN="flannel"
NETWORK_PLUGIN_VERSION="v0.26.4"
NETWORK_PLUGIN_CNI="flannel-cni-plugin"
NETWORK_PLUGIN_CNI_VERSION="v1.6.2-flannel1"

# Kubernetes control-plane images (Aliyun mirror)
KUBERNETES_IMAGES=(
    "${ALIYUN_REGISTRY}/kube-apiserver:${K8S_VERSION}"
    "${ALIYUN_REGISTRY}/kube-controller-manager:${K8S_VERSION}"
    "${ALIYUN_REGISTRY}/kube-scheduler:${K8S_VERSION}"
    "${ALIYUN_REGISTRY}/kube-proxy:${K8S_VERSION}"
    "${ALIYUN_REGISTRY}/pause:3.9"
    "${ALIYUN_REGISTRY}/etcd:3.5.9-0"
    "${ALIYUN_REGISTRY}/coredns:v1.10.1"
)

# Network plugin images (Flannel)
NETWORK_IMAGES=(
    "${FLANNEL_REPO}/${NETWORK_PLUGIN}:${NETWORK_PLUGIN_VERSION}"
)
NETWORK_CNI_IMAGES=(
    "${FLANNEL_REPO}/${NETWORK_PLUGIN_CNI}:${NETWORK_PLUGIN_CNI_VERSION}"
)

# Merge all image lists
ALL_IMAGES=("${KUBERNETES_IMAGES[@]}" "${NETWORK_IMAGES[@]}" "${NETWORK_CNI_IMAGES[@]}")

# Export locally present images (run on the source node)
function export_images() {
    echo "==> Exporting locally present Kubernetes v${K8S_VERSION} images..."
    mkdir -p /opt/k8s-images
    cd /opt/k8s-images || exit

    for image in "${ALL_IMAGES[@]}"; do
        echo "Checking and exporting image: ${image}"
        if ctr -n=k8s.io images list --quiet | grep -q "${image}"; then
            output_file="${image//\//_}.tar"
            ctr -n=k8s.io images export ${output_file} ${image} --platform=linux/amd64
            echo "✅ Exported: ${output_file}"
        else
            echo "⚠️ Image ${image} not found locally, skipping!"
        fi
    done
}

# Run export or import as needed
export_images # Run on the source node to export images
116
script/generate_apitoken.sh
Executable file
@@ -0,0 +1,116 @@
#!/bin/bash

# Variables
NAMESPACE="my-namespace"
SERVICE_ACCOUNT="my-sa"

# YAML content (ensure the Deployment explicitly uses the ServiceAccount)
all_resources_yaml='
apiVersion: v1
kind: Namespace
metadata:
  name: '"$NAMESPACE"'

---

apiVersion: v1
kind: ServiceAccount
metadata:
  name: '"$SERVICE_ACCOUNT"'
  namespace: '"$NAMESPACE"'

---

apiVersion: v1
kind: Service
metadata:
  name: my-mysql-service
  namespace: '"$NAMESPACE"'
spec:
  type: NodePort
  selector:
    app: mysql
  ports:
  - protocol: TCP
    port: 3306
    targetPort: 3306
    nodePort: 30060

---

apiVersion: apps/v1
kind: Deployment
metadata:
  name: mysql-deployment
  namespace: '"$NAMESPACE"'
spec:
  replicas: 1
  selector:
    matchLabels:
      app: mysql
  template:
    metadata:
      labels:
        app: mysql
    spec:
      serviceAccountName: '"$SERVICE_ACCOUNT"' # Key point: force the Pod to use this ServiceAccount
      containers:
      - name: mysql
        image: mysql:8.0
        env:
        - name: MYSQL_ROOT_PASSWORD
          value: "123456"
        resources:
          limits:
            cpu: "300m"
            memory: "512Mi"
'

# Create the resources
create_resources() {
    echo "$all_resources_yaml" | kubectl apply -f -
    if [ $? -ne 0 ]; then
        echo "Failed to create resources"
        exit 1
    fi
    # New: wait for the Secret to be generated (up to 10 seconds)
    echo "Waiting for the ServiceAccount's Secret to be generated..."
    for i in {1..10}; do
        local secret_name=$(kubectl get serviceaccount "$SERVICE_ACCOUNT" -n "$NAMESPACE" -o jsonpath='{.secrets[0].name}' 2>/dev/null)
        if [ -n "$secret_name" ]; then
            break
        fi
        sleep 1
    done
}

# Delete the resources
delete_resources() {
    echo "$all_resources_yaml" | kubectl delete -f -
    if [ $? -ne 0 ]; then
        echo "Failed to delete resources"
        exit 1
    fi
}

# Fetch the token (with a clearer error message)
get_service_account_token() {
    local secret_name=$(kubectl get serviceaccount "$SERVICE_ACCOUNT" -n "$NAMESPACE" -o jsonpath='{.secrets[0].name}' 2>/dev/null)
    if [ -z "$secret_name" ]; then
        echo "Error: the ServiceAccount's Secret was not generated; check whether the Pod is running"
        exit 1
    fi
    local token=$(kubectl get secret -n "$NAMESPACE" "$secret_name" -o jsonpath='{.data.token}' | base64 -d)
    echo "ApiToken: $token"
}

# Execution flow
create_resources
#echo "Resources created"
#kubectl get all -n "$NAMESPACE"

#echo "Fetching the ServiceAccount token..."
#get_service_account_token


#delete_resources
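
The token printed by `get_service_account_token` can authenticate the Python client directly against the API server. A minimal sketch; the API server address is a placeholder, and `verify_ssl=False` mirrors the script's loose TLS posture rather than a recommendation:

```python
from kubernetes import client

token = "..."  # paste the output of get_service_account_token here
cfg = client.Configuration()
cfg.host = "https://<apiserver>:6443"  # placeholder API server address
cfg.verify_ssl = False
cfg.api_key = {"authorization": "Bearer " + token}
api = client.CoreV1Api(client.ApiClient(cfg))
print([p.metadata.name for p in api.list_namespaced_pod("my-namespace").items])
```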
74
script/import_images.sh
Normal file
@@ -0,0 +1,74 @@
#!/bin/bash

# Kubernetes version and image registry settings
K8S_VERSION="v1.28.2"
ALIYUN_REGISTRY="registry.aliyuncs.com/google_containers" # Aliyun Kubernetes image mirror
HANGZHOU_ALIYUN_REGISTRY="registry.cn-hangzhou.aliyuncs.com/google_containers" # Hangzhou Aliyun mirror
FLANNEL_REPO="ghcr.io/flannel-io" # Flannel image repository
NETWORK_PLUGIN="flannel"
NETWORK_PLUGIN_VERSION="v0.26.4"
NETWORK_PLUGIN_CNI="flannel-cni-plugin"
NETWORK_PLUGIN_CNI_VERSION="v1.6.2-flannel1"

# Kubernetes control-plane images (Aliyun mirror)
KUBERNETES_IMAGES=(
    "${ALIYUN_REGISTRY}/kube-apiserver:${K8S_VERSION}"
    "${ALIYUN_REGISTRY}/kube-controller-manager:${K8S_VERSION}"
    "${ALIYUN_REGISTRY}/kube-scheduler:${K8S_VERSION}"
    "${ALIYUN_REGISTRY}/kube-proxy:${K8S_VERSION}"
    "${ALIYUN_REGISTRY}/pause:3.9"
    "${ALIYUN_REGISTRY}/etcd:3.5.9-0"
    "${ALIYUN_REGISTRY}/coredns:v1.10.1"
    "${ALIYUN_REGISTRY}/metrics-server:v0.7.2"
    "${HANGZHOU_ALIYUN_REGISTRY}/kube-webhook-certgen:v1.1.1"
    "${HANGZHOU_ALIYUN_REGISTRY}/nginx-ingress-controller:v1.5.1"
)

# Network plugin images (Flannel)
NETWORK_IMAGES=(
    "${FLANNEL_REPO}/${NETWORK_PLUGIN}:${NETWORK_PLUGIN_VERSION}"
)
NETWORK_CNI_IMAGES=(
    "${FLANNEL_REPO}/${NETWORK_PLUGIN_CNI}:${NETWORK_PLUGIN_CNI_VERSION}"
)

# Merge all image lists
ALL_IMAGES=("${KUBERNETES_IMAGES[@]}" "${NETWORK_IMAGES[@]}" "${NETWORK_CNI_IMAGES[@]}")

# Import images and auto-fix the configuration (run on the target node)
function import_images() {
    echo "==> Importing images on the target node..."

    # 2. Stop the containerd service
    sudo systemctl stop containerd

    # 3. Enter the image directory
    cd /opt/k8s-images || exit

    # 4. Remove stale images (filtered by the registry addresses above)
    echo "Removing stale images..."
    for img in $(ctr -n=k8s.io images list --quiet); do
        if [[ $img == ${ALIYUN_REGISTRY}* || $img == ${FLANNEL_REPO}* || $img == ${HANGZHOU_ALIYUN_REGISTRY}* ]]; then
            ctr -n=k8s.io images rm $img || true
        fi
    done

    sudo systemctl start containerd

    # 5. Import every tar file
    for file in *.tar; do
        echo "Importing image: ${file}"
        ctr -n=k8s.io images import ${file} --platform=linux/amd64
        echo "✅ Imported: ${file}"
    done

    # 6. Start containerd and verify
    #sudo systemctl restart containerd
    echo "Imported image list:"
    ctr -n=k8s.io images list | grep -E "${ALIYUN_REGISTRY}|${HANGZHOU_ALIYUN_REGISTRY}|${FLANNEL_REPO}"
    crictl images
}


# Run the import as needed
import_images # Run on the target node to import images
660
script/k8s_install.sh
Normal file
@@ -0,0 +1,660 @@
#!/bin/bash

# Some Ubuntu systems pop up interactive dialogs while installing packages; we suppress them here.
# How the non-interactive setup works: DEBIAN_FRONTEND=noninteractive is the core switch for apt/dpkg; the UCF variables and config files below are supplements that auto-pick the new/old version on config-file conflicts so no dialog appears.
# About packagekit: the service mainly supports graphical package management. On a server it can be stopped, but it should not be masked; masking would prevent the system from managing the service and its dependents normally.

# Disable interactive package-management prompts (does not affect system services)
export DEBIAN_FRONTEND=noninteractive
export UCF_FORCE_CONFFNEW=1
export UCF_FORCE_CONFFMISS=1
export UCF_FORCE_CONFFIGNORE=1

# Configure non-interactive behavior for apt and dpkg
echo 'Dpkg::Options {
"--force-confdef";
"--force-confnew";
}' > /etc/apt/apt.conf.d/99noninteractive
echo 'force-confold' > /etc/dpkg/dpkg.cfg.d/force-confold

# Optimization: only stop packagekit (do not mask it)
systemctl stop packagekit

echo "########## K8S installation must be run as root ###########"
# Check for root
if [ "$(id -u)" != "0" ]; then
    echo "Please run this script as root"
    exit 1
fi

# On a fresh ubuntu 18.04 machine, switch to the Aliyun apt mirrors first:
cp /etc/apt/sources.list /etc/apt/sources.list.bak
tee /etc/apt/sources.list << EOF
deb http://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse

deb http://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse

deb http://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse

deb http://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse

deb http://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse
deb-src http://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse
EOF
apt-get update -y

apt upgrade -y

apt install -y libtss2-esys0 -f

# Exit immediately on error and report the failing step
set -e
# set -o pipefail

# Helper: log an info message
log_info() {
    echo "[INFO] $1"
}

# Helper: log an error message and exit
log_error() {
    echo "[ERROR] $1" >&2
    exit 1
}

# Disable the firewall
# log_info "Disabling the firewall..."
# ufw disable || log_error "Failed to disable the firewall"

# SELinux handling
log_info "Installing selinux-utils..."
apt install -y selinux-utils || log_error "Failed to install selinux-utils"
log_info "Setting SELinux to permissive mode..."
if grep -q "SELINUX=enforcing" /etc/selinux/config || grep -q "SELINUX=permissive" /etc/selinux/config; then
    echo "SELinux is enabled"
    setenforce 0 || log_error "Failed to set the SELinux mode"
    sed -i 's/^SELINUX=enforcing$/SELINUX=permissive/' /etc/selinux/config || log_error "Failed to edit the SELinux config file"
else
    echo "SELinux is not enabled"
fi

# Install htop, vim, net-tools
apt install vim htop net-tools -y || log_error "Failed to install htop, vim, net-tools"

# Disable the swap partition
log_info "Disabling the swap partition..."
swapoff -a || log_error "Failed to disable swap"
# Comment out the swap line
sed -i '/swap/s/^/#/' /etc/fstab || log_error "Failed to comment out the swap line"

# Pass bridged IPv4 traffic to the iptables chains
log_info "Configuring bridged IPv4 traffic to pass through the iptables chains..."
cat > /etc/sysctl.d/k8s.conf <<EOF
net.bridge.bridge-nf-call-ip6tables = 1
net.bridge.bridge-nf-call-iptables = 1
EOF
sysctl --system || log_error "Failed to apply the sysctl settings"

# Add the k8s package mirror
log_info "Adding the k8s package mirror..."
curl -s https://mirrors.aliyun.com/kubernetes/apt/doc/apt-key.gpg | apt-key add - || log_error "Failed to add the k8s mirror key"
echo "deb https://mirrors.aliyun.com/kubernetes/apt/ kubernetes-xenial main" > /etc/apt/sources.list.d/kubernetes.list
apt-get update -y || log_error "Failed to update the apt sources"

# Install nfs
# log_info "Installing nfs-common..."
# apt-get install -y nfs-common || log_error "Failed to install nfs-common"
apt install -y aptitude

# Update the system and install required tools
log_info "Updating the system and installing required tools..."
apt update -y || log_error "System update or upgrade failed"
apt install -y curl apt-transport-https ipvsadm gnupg2 software-properties-common || log_error "Failed to install the required tools"

# Install docker
log_info "Skipping the docker installation..."
# Remove the existing Docker apt source
# if [ -f /etc/apt/sources.list.d/docker.list ]; then
#     rm /etc/apt/sources.list.d/docker.list
# fi

# Add Aliyun's Docker mirror
# Back up the existing file
# if [ -f /usr/share/keyrings/docker-archive-keyring.gpg ]; then
#     mv /usr/share/keyrings/docker-archive-keyring.gpg /usr/share/keyrings/docker-archive-keyring.gpg.bak
# fi
# Overwrite the existing file
curl -fsSL https://mirrors.aliyun.com/docker-ce/linux/ubuntu/gpg | gpg --batch --yes --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://mirrors.aliyun.com/docker-ce/linux/ubuntu $(lsb_release -cs) stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null


# Update the apt sources
apt update -y || log_error "Failed to update the apt sources"
# apt install docker-ce=5:20.10.24~3-0~ubuntu-focal docker-ce-cli=5:20.10.24~3-0~ubuntu-focal containerd.io --allow-downgrades -y || log_error "Failed to install docker"
apt install containerd --allow-downgrades -y || log_error "Failed to install containerd"
systemctl enable containerd || log_error "Failed to enable the containerd service"

# Configure crictl for containerd
if [ ! -f /etc/crictl.yaml ]; then
    sudo tee /etc/crictl.yaml > /dev/null <<EOF
runtime-endpoint: unix:///var/run/containerd/containerd.sock
image-endpoint: unix:///var/run/containerd/containerd.sock
timeout: 10
debug: false
pull-image-on-create: false
EOF
fi

# Install kubeadm, kubelet, kubectl
log_info "Installing kubeadm, kubelet, kubectl..."
# wget https://pkgs.k8s.io/core:/stable:/v1.21/deb/Release.key -O apt-key.gpg || log_error "Failed to download the kubeadm key"
# apt-key add apt-key.gpg && rm -f apt-key.gpg || log_error "Failed to import & remove the apt-key.gpg file"
curl -s https://mirrors.aliyun.com/kubernetes/apt/doc/apt-key.gpg | apt-key add - || log_error "Failed to add the k8s mirror key"
# curl -fsSL https://pkgs.k8s.io/core:/stable:/v1.28/deb/Release.key | sudo gpg --dearmor -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg
# echo "deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v1.28/deb/ /" | sudo tee /etc/apt/sources.list.d/kubernetes.list
echo "deb https://mirrors.aliyun.com/kubernetes/apt/ kubernetes-xenial main" | tee /etc/apt/sources.list.d/kubernetes.list
apt-get update -y || log_error "Failed to update the apt sources for kubeadm"

apt install -y kubelet=1.28.2-00 kubeadm=1.28.2-00 kubectl=1.28.2-00 --allow-downgrades --allow-change-held-packages || log_error "Failed to install kubeadm, kubelet, kubectl"
apt-mark hold kubeadm kubelet kubectl # Prevent problems caused by automatic upgrades

systemctl enable kubelet && systemctl start kubelet || log_error "Failed to start the kubelet service"

# Back up docker's daemon.json
# if [ -f /etc/docker/daemon.json ]; then
#     cp /etc/docker/daemon.json /etc/docker/daemon.json.bak
# fi

# Configure docker's daemon.json
# cat <<EOF > /etc/docker/daemon.json
# {"registry-mirrors":["https://registry.docker-cn.com","https://registry.cn-hangzhou.aliyuncs.com"],"exec-opts": ["native.cgroupdriver=systemd"]}
# EOF

# Reload the docker config and restart the docker service
systemctl daemon-reload
# systemctl restart docker

# Node initialization
sudo modprobe br_netfilter
sudo sysctl net.bridge.bridge-nf-call-iptables=1

# Load the required kernel modules
sudo modprobe overlay
sudo modprobe br_netfilter

# Edit `/etc/modules-load.d/k8s.conf` and add:
cat <<EOF | sudo tee /etc/modules-load.d/k8s.conf
overlay
br_netfilter
EOF

# Edit `/etc/sysctl.d/k8s.conf` and configure the network parameters:
cat <<EOF | sudo tee /etc/sysctl.d/k8s.conf
net.bridge.bridge-nf-call-iptables = 1
net.bridge.bridge-nf-call-ip6tables = 1
net.ipv4.ip_forward = 1
EOF

# Apply the settings
sudo sysctl --system

# Write containerd's default config to a file
mkdir -p /etc/containerd
containerd config default > /etc/containerd/config.toml

# Create directories
sudo mkdir -p /etc/containerd/certs.d
mkdir -p /etc/containerd/certs.d/docker.io
mkdir -p /etc/containerd/certs.d/registry.k8s.io
mkdir -p /etc/containerd/certs.d/gcr.io

## Aliyun mirror addresses
ALIYUN_DOCKER="https://registry.docker-cn.com"
ALIYUN_K8S="https://registry.aliyuncs.com/google_containers"
ALIYUN_GCR="$ALIYUN_K8S" # gcr.io also uses the Aliyun mirror

# Config file paths
CONFIG_TOML="/etc/containerd/config.toml"
CERTS_DIR="/etc/containerd/certs.d"

# 1. Edit the containerd config file
echo "Configuring containerd registry mirrors..."
if ! grep -q 'config_path' "$CONFIG_TOML"; then
    # Add the config_path setting to config.toml
    sudo sed -i '$a\ [plugins."io.containerd.grpc.v1.cri".registry]\n config_path = "'"$CERTS_DIR"'"' "$CONFIG_TOML"
fi

# 2. Create the certs.d directory (if missing)
sudo mkdir -p "$CERTS_DIR"

# 3. Configure the Docker Hub mirror
echo "Configuring the Docker Hub mirror..."
sudo mkdir -p "$CERTS_DIR/docker.io"
cat <<EOF | sudo tee "$CERTS_DIR/docker.io/hosts.toml"
server = "https://docker.io"
[host."$ALIYUN_DOCKER"]
  capabilities = ["pull", "resolve"]
EOF

# 4. Configure the official Kubernetes registry
echo "Configuring the Kubernetes registry mirror..."
sudo mkdir -p "$CERTS_DIR/registry.k8s.io"
cat <<EOF | sudo tee "$CERTS_DIR/registry.k8s.io/hosts.toml"
server = "https://registry.k8s.io"
[host."$ALIYUN_K8S"]
  capabilities = ["pull", "resolve"]
EOF

# 5. Configure Google Container Registry (gcr.io)
echo "Configuring the Google Container Registry mirror..."
sudo mkdir -p "$CERTS_DIR/gcr.io"
cat <<EOF | sudo tee "$CERTS_DIR/gcr.io/hosts.toml"
server = "https://gcr.io"
[host."$ALIYUN_GCR"]
  capabilities = ["pull", "resolve"]
EOF

# 5. Fix the pause image address (use the Aliyun mirror)
sudo sed -i 's|sandbox_image = "registry.k8s.io/pause:.*"|sandbox_image = "registry.aliyuncs.com/google_containers/pause:3.9"|g' /etc/containerd/config.toml

# --- Correct config entries ---


# 1. Check and set [plugins."io.containerd.grpc.v1.cri".containerd].systemd_cgroup = true
# echo "Checking/fixing 'systemd_cgroup' configuration..."
# if ! grep -q 'systemd_cgroup = true' "$CONFIG_TOML"; then
#     # Replace the whole line with sed; no capture group needed
#     sed -i "/^\s*systemd_cgroup\s*=\s*.*/c\
#     systemd_cgroup = true" "$CONFIG_TOML"
#     echo "Modified 'systemd_cgroup' to 'true'."
# else
#     echo "'systemd_cgroup' is already set to 'true'."
# fi

# 2. Check and set [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options].SystemdCgroup = true
# Ensure the config file exists
if [ ! -f "$CONFIG_TOML" ]; then
    echo "Error: Config file not found at $CONFIG_TOML"
    exit 1
fi

echo "Checking/fixing 'SystemdCgroup' configuration..."
# Use sed to set SystemdCgroup to true, preserving indentation
if ! grep -q '^\s*SystemdCgroup\s*=\s*true' "$CONFIG_TOML"; then
    # Replace only the value to the right of '=', keeping the indentation and key
    sed -i 's/^\(\s*SystemdCgroup\s*=\s*\).*/\1true/' "$CONFIG_TOML"
    echo "Modified 'SystemdCgroup' to 'true'. New value:"
    grep '^\s*SystemdCgroup\s*=\s*true' "$CONFIG_TOML"
else
    echo "'SystemdCgroup' is already set to 'true'."
fi

# 3. Restart the containerd service
echo "Restarting containerd..."
sudo systemctl restart containerd
if [ $? -eq 0 ]; then
    echo "containerd restarted successfully."
else
    echo "Failed to restart containerd. Check logs for errors."
fi

# Verify the configuration
echo "Verifying configuration..."
crictl info | grep -i "systemd_cgroup" && crictl info | grep -i "SystemdCgroup"
echo "Initial containerd configuration fixes are complete."

echo "Updating the containerd config for GPU instances"

# Check for an NVIDIA GPU
if lspci | grep -i nvidia > /dev/null 2>&1; then
    log_info "NVIDIA GPU detected; configuring nvidia-container-runtime..."

    dpkg -i /opt/*.deb || log_error "Failed to install nvidia-container-runtime and its dependencies!"

    # Configure containerd to support the nvidia runtime
    CONTAINERD_CONFIG="/etc/containerd/config.toml"
    if ! grep -q '\[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia\]' "$CONTAINERD_CONFIG"; then
        cat <<EOF >> "$CONTAINERD_CONFIG"

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
  privileged_without_host_devices = false
  runtime_type = "io.containerd.runc.v2"
  [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
    BinaryName = "/usr/bin/nvidia-container-runtime"
EOF
    fi

    # Restart containerd
    systemctl restart containerd
    log_info "nvidia-container-runtime configured; containerd restarted"
else
    log_info "No NVIDIA GPU detected; skipping the nvidia-container-runtime configuration"
fi

# Switch DNS to Aliyun public DNS (speeds up image pulls)
# sudo tee /etc/resolv.conf <<EOF
# nameserver 223.5.5.5
# nameserver 223.6.6.6
# nameserver 8.8.8.8
# nameserver 114.114.114.114
# EOF

# 5. Verify the configuration took effect
# sudo crictl --runtime-endpoint unix:///run/containerd/containerd.sock info
crictl info

# Enable IP forwarding
# Handles: [ERROR FileContent--proc-sys-net-ipv4-ip_forward]: /proc/sys/net/ipv4/ip_forward contents are not set to 1
if ! grep -q "^net.ipv4.ip_forward = 1" /etc/sysctl.conf; then
    echo "net.ipv4.ip_forward = 1" | sudo tee -a /etc/sysctl.conf > /dev/null
    sudo sysctl -p
fi

# nfs_server_ip="192.168.0.3" # Replace with the actual NFS server IP
# nfs_share_path="/d/k8s_nss"
echo "======== Resolving the NFS server IP and shared directory dynamically ========"
nfs_server_ip="$2" # Replace with the actual NFS server IP
nfs_share_path="$3" # Replace with the actual NFS shared directory

# Without changing the original logic, migrate the K8s data directories to $nfs_share_path
log_info "Migrating the K8s data directories to the $nfs_share_path mount point..."

# Migrate the containerd data directory
if [ ! -d $nfs_share_path/containerd ]; then
    mkdir -p $nfs_share_path/containerd
fi
if [ -d /var/lib/containerd ] && [ ! -L /var/lib/containerd ]; then
    systemctl stop containerd
    mv /var/lib/containerd/* $nfs_share_path/containerd/ 2>/dev/null || true
    rm -rf /var/lib/containerd
    ln -sf $nfs_share_path/containerd /var/lib/
    systemctl start containerd
fi

# Migrate the kubelet data directory
if [ ! -d $nfs_share_path/kubelet ]; then
    mkdir -p $nfs_share_path/kubelet
fi
if [ ! -L /var/lib/kubelet ]; then
    systemctl stop kubelet
    mv /var/lib/kubelet/* $nfs_share_path/kubelet/ 2>/dev/null || true
    rm -rf /var/lib/kubelet
    ln -sf $nfs_share_path/kubelet /var/lib/
    systemctl start kubelet
fi

# Migrate the kubeadm data directory
if [ ! -d $nfs_share_path/kubeadm ]; then
    mkdir -p $nfs_share_path/kubeadm
fi
if [ ! -L /var/lib/kubeadm ]; then
    mv /var/lib/kubeadm/* $nfs_share_path/kubeadm/ 2>/dev/null || true
    rm -rf /var/lib/kubeadm
    ln -sf $nfs_share_path/kubeadm /var/lib/
fi

# Migrate the etcd data directory (master node only)
if [ "$1" == "master" ]; then
    if [ ! -d $nfs_share_path/etcd ]; then
        mkdir -p $nfs_share_path/etcd
    fi
    if [ ! -L /var/lib/etcd ]; then
        systemctl stop kubelet 2>/dev/null || true
        mv /var/lib/etcd/* $nfs_share_path/etcd/ 2>/dev/null || true
        rm -rf /var/lib/etcd
        ln -sf $nfs_share_path/etcd /var/lib/
        systemctl start kubelet 2>/dev/null || true
    fi
fi

# Fix ownership
chown -R root:root $nfs_share_path/containerd $nfs_share_path/kubelet $nfs_share_path/kubeadm $nfs_share_path/etcd 2>/dev/null || true

log_info "K8s data directory migration finished; all data will live under $nfs_share_path."

# Determine whether this is a master or a worker node
if [ "$1" == "master" ]; then
    # Write hosts entries
    # if ! grep -q "k8s-master" /etc/hosts; then
    #     echo "127.0.0.1 k8s-master" | sudo tee -a /etc/hosts > /dev/null
    # fi
    # Change the hostname; k8s-master is assumed here, adjust as needed
    hostnamectl set-hostname k8s-master || log_error "Failed to change the hostname"
    # Open firewall ports
    log_info "Opening firewall ports..."
    # Install and configure ufw (open only the required ports)
    # Open the Kubernetes control-plane ports
    sudo ufw allow 6443/tcp
    sudo ufw allow 10257/tcp
    sudo ufw allow 2379:2380/tcp

    # Open kubelet and component communication ports (cluster-internal only)
    # Note: access to port 10250 must be strictly limited; never expose it to the public internet
    sudo ufw allow 10250:10252/tcp

    # Open the NodePort service range
    sudo ufw allow 30000:32767/tcp

    # Open CNI plugin ports (e.g. Calico)
    sudo ufw allow 4789/udp
    sudo ufw allow 179/tcp

    # Open Ingress ports (e.g. Nginx Ingress)
    sudo ufw allow 80/tcp
    sudo ufw allow 443/tcp
    # sudo ufw enable
    # Master node installation steps
    log_info "Installing core components and initializing on the master node"
    # kubeadm config images list

    # Import local images to cut pull time
    chmod 755 /opt/import_images.sh && /opt/import_images.sh

    sleep 1
    log_info "Initializing the master node..."
    # kubeadm init --image-repository=registry.aliyuncs.com/google_containers --pod-network-cidr=10.244.0.0/16 --service-cidr=10.96.0.0/12 || log_error "Master node initialization failed"
    # kubeadm init --config=kubeadm.yaml --pod-network-cidr=10.244.0.0/16 --service-cidr=10.96.0.0/12
    kubeadm init --image-repository=registry.aliyuncs.com/google_containers --pod-network-cidr=10.244.0.0/16 --service-cidr=10.96.0.0/12 --kubernetes-version=v1.28.2 || log_error "Master node initialization failed"
    # sudo chmod 644 /etc/kubernetes/pki/*
    # sudo chown -R root:root /etc/kubernetes/pki

    # Run this on the master to generate the join command for worker nodes
    log_info "Generating the worker-node join command..."
    join_command=$(kubeadm token create --print-join-command 2>/dev/null)
    # join_command=$(kubeadm token create --print-join-command --ttl 0 2>/dev/null)
    if [ -z "$join_command" ]; then
        log_error "Failed to generate the join command"
    else
        echo "$join_command" > join_command.txt
        echo "The join command has been saved to join_command.txt; cat it in a new window and copy it to each worker node to register with the cluster"
        # The remaining steps continue from here
        # Configure kubectl
        log_info "Configuring kubectl..."
        mkdir -p $HOME/.kube
        cp -i /etc/kubernetes/admin.conf $HOME/.kube/config || log_error "Failed to copy the kubeconfig file"
        chown $(id -u):$(id -g) $HOME/.kube/config || log_error "Failed to change kubeconfig file ownership"

        echo "Master node installation finished..."
        sleep 1
        # Install the network plugin
        log_info "Installing the network plugin (flannel)"
        kubectl apply -f /opt/kube-flannel.yml || log_error "Failed to install the flannel network plugin locally"
        log_info "Installing the MetricsServer plugin"
        kubectl apply -f /opt/components.yaml || log_error "Failed to install the MetricsServer plugin locally"
        log_info "Installing the Ingress-nginx-controller plugin"
        kubectl apply -f /opt/ingress-nginx-controller.yaml || log_error "Failed to install the ingress-nginx-controller plugin locally"
        log_info "Installing the plugins required for GPU mode"
        kubectl apply -f /opt/nvidia-device-plugin.yml || log_error "Failed to install the GPU-mode plugins locally"
        log_info "Installing the nfs-client-provisioner plugin"
        aptitude -y install nfs-kernel-server nfs-common=1:1.3.4-2.5ubuntu3.7
        if [ $? -ne 0 ]; then
            echo "NFS server installation failed; check the network connection or package sources."
            exit 1
        fi

        # Create the cluster shared directory
        # Check whether the NFS shared directory exists; create it if not
        # For now the control node hosts all shared storage; this should later move to a dynamic NFS server
        mkdir -p $nfs_share_path

        # The line to add to /etc/exports
        line="$nfs_share_path *(rw,sync,no_root_squash,no_subtree_check)"

        # Check whether /etc/exports already contains the line
|
||||
if ! grep -qF "$line" /etc/exports; then
|
||||
# 若不包含,则添加该行
|
||||
echo "$line" >> /etc/exports
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "共享目录配置文件修改失败,请检查文件权限。"
|
||||
exit 1
|
||||
else
|
||||
echo "成功添加共享目录配置。"
|
||||
fi
|
||||
else
|
||||
echo "共享目录配置已存在,无需重复添加。"
|
||||
fi
|
||||
|
||||
# 启动 NFS 服务
|
||||
echo "启动 NFS 服务..."
|
||||
systemctl restart nfs-kernel-server
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "NFS 服务启动失败,请检查配置文件。"
|
||||
exit 1
|
||||
fi
|
||||
kubectl apply -f /opt/storage_class.yaml || log_error "集群存储类nfs-storage-class初始化失败"
|
||||
#kubectl apply -f /opt/nfs-provisioner-deploy.yaml || log_error "动态存储nfs-provisioner-deploy初始化失败"
|
||||
echo "!!! 此处更换成读取动态的NFS服务器: xxx.xx.xx.xxx 及共享目录: /a/b/c !!!"
|
||||
nfs_provisioner_yaml='
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: nfs-client-provisioner
|
||||
labels:
|
||||
app: nfs-client-provisioner
|
||||
spec:
|
||||
replicas: 1
|
||||
strategy:
|
||||
type: Recreate ## 设置升级策略为删除再创建(默认为滚动更新)
|
||||
selector:
|
||||
matchLabels:
|
||||
app: nfs-client-provisioner
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: nfs-client-provisioner
|
||||
spec:
|
||||
serviceAccountName: nfs-client-provisioner
|
||||
containers:
|
||||
- name: nfs-client-provisioner
|
||||
#image: gcr.io/k8s-staging-sig-storage/nfs-subdir-external-provisioner:v4.0.0
|
||||
image: registry.cn-beijing.aliyuncs.com/xngczl/nfs-subdir-external-provisione:v4.0.0
|
||||
volumeMounts:
|
||||
- name: nfs-client-root
|
||||
mountPath: /persistentvolumes
|
||||
env:
|
||||
- name: PROVISIONER_NAME ## Provisioner的名称,以后设置的storageclass要和这个保持一致
|
||||
value: k8s-sigs.io/nfs-subdir-external-provisioner
|
||||
- name: NFS_SERVER ## NFS服务器地址,需和valumes参数中配置的保持一致
|
||||
value: '"$nfs_server_ip"' ## 替换为实际的NFS服务器IP
|
||||
- name: NFS_PATH ## NFS服务器数据存储目录,需和valumes参数中配置的保持一致
|
||||
value: '"$nfs_share_path"' ## 替换为实际的NFS服务器共享目录
|
||||
volumes:
|
||||
- name: nfs-client-root
|
||||
nfs:
|
||||
server: '"$nfs_server_ip"' ## NFS服务器地址
|
||||
path: '"$nfs_share_path"' ## NFS服务器数据存储目录
|
||||
readOnly: false
|
||||
'
|
||||
echo "$nfs_provisioner_yaml" | kubectl apply -f -
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "nfs动态工具链创建失败"
|
||||
exit 1
|
||||
fi
|
||||
kubectl apply -f /opt/nfs-rbac.yaml || log_error "集群共享存储权限nfs-rbac初始化失败"
|
||||
# 修改 deployment.yaml 文件,设置 NFS 服务器地址和共享目录
|
||||
# sed -i 's|NFS_SERVER|your_nfs_server_ip|g' deployment.yaml
|
||||
# sed -i 's|NFS_PATH|your_nfs_shared_directory|g' deployment.yaml
|
||||
|
||||
# # 创建资源
|
||||
# kubectl apply -f rbac.yaml
|
||||
# kubectl apply -f deployment.yaml
|
||||
# kubectl apply -f class.yaml
|
||||
|
||||
sleep 3
|
||||
|
||||
# 查询组件状态
|
||||
log_info "查询组件状态..."
|
||||
# 检查是否有组件状态为 Unhealthy
|
||||
if kubectl get componentstatuses 2>/dev/null | grep -q 'Unhealthy'; then
|
||||
echo "检测到组件状态为 Unhealthy, 开始修复..."
|
||||
|
||||
# 注释掉 --port=0 参数(添加备份文件)
|
||||
sed -i.bak '/--port=0/s/^/#/' /etc/kubernetes/manifests/kube-controller-manager.yaml
|
||||
sed -i.bak '/--port=0/s/^/#/' /etc/kubernetes/manifests/kube-scheduler.yaml
|
||||
|
||||
echo "已生成备份文件: kube-controller-manager.yaml.bak 和 kube-scheduler.yaml.bak"
|
||||
echo "修复完成,等待组件重启..."
|
||||
|
||||
else
|
||||
echo "所有组件状态正常,无需修复。"
|
||||
fi
|
||||
sleep 5
|
||||
systemctl restart kubelet.service || log_error "重启kubelet服务失败"
|
||||
log_info "30秒后再次查看组件状态..."
|
||||
sleep 30
|
||||
# 再次查看组件状态(需要稍等)
|
||||
kubectl get cs || log_info "再次获取组件状态失败"
|
||||
|
||||
echo "验证集群状态(安装完毕后手动执行),查看pod状态"
|
||||
log_info "查看pod状态..."
|
||||
kubectl get nodes || log_info "获取节点状态失败"
|
||||
kubectl get pods --all-namespaces || log_info "获取所有命名空间的pod状态失败"
|
||||
fi
|
||||
|
||||
elif [ "$1" == "worker" ]; then
|
||||
# 修改主机名
|
||||
apt install telnet -y
|
||||
aptitude -y install nfs-common=1:1.3.4-2.5ubuntu3.7
|
||||
# 写入hosts
|
||||
# if ! grep -q "k8s-worker" /etc/hosts; then
|
||||
# echo "127.0.0.1 k8s-worker" | sudo tee -a /etc/hosts > /dev/null
|
||||
# fi
|
||||
# 这里假设新主机名为 k8s-node,可根据实际情况修改
|
||||
hostnamectl set-hostname "k8s-worker-$(date +%Y%m%d%H%M%S)" || log_error "修改主机名失败"
|
||||
# 副节点安装步骤
|
||||
log_info "正在worker节点进行安装"
|
||||
apt update -y || log_error "更新apt源失败"
|
||||
# 从节点重启kubeadm,可解决曾启动过导致端口被占用的问题
|
||||
log_info "从节点重启kubeadm,可解决曾启动过导致端口被占用的问题..."
|
||||
kubeadm reset -f|| log_error "重置kubeadm失败"
|
||||
# 获取主节点的join命令(假设已提前获取并保存为join_command.txt)
|
||||
|
||||
# 导入本地网络插件部分镜像减少拉取时间
|
||||
chmod 755 /opt/import_images.sh && /opt/import_images.sh
|
||||
|
||||
echo "请输入加入对方kubernetes集群的命令: (任何时候)"
|
||||
# read join_command
|
||||
# eval "$join_command" || log_error "加入k8s集群失败"
|
||||
else
|
||||
echo "请指定正确的节点类型,master或worker"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# 检查安装过程是否有错误(这里只是简单示例,实际可能需要更详细的检查)
|
||||
if [ $? -ne 0 ]; then
|
||||
log_error "安装过程中出现错误,请手动解决后再重新执行"
|
||||
fi
|
||||
|
||||
log_info "安装脚本执行完毕"
|
||||
# 输出安装完成提示
|
||||
log_info "Kubernetes 安装脚本执行完毕,请根据提示进行后续操作。"
|
||||
log_info "如果是主节点,请在新窗口cat join_command.txt查看并拷贝到worker node进行集群注册"
|
||||
log_info "如果是worker节点,请在新窗口输入主节点提供的join命令进行集群注册"
|
||||
log_info "请注意,在执行完脚本后,可能需要等待一段时间以确保所有组件正常运行。"
|
||||
log_info "可以使用 'kubectl get nodes' 和 'kubectl get pods --all-namespaces' 命令来检查集群状态。"
|
||||
log_info "如果有任何问题,请检查日志或联系管理员Ahexl。"
|
||||
log_info "感谢使用本脚本,祝您使用愉快!"
|
||||
87
script/k8s_uninstall.sh
Normal file
@ -0,0 +1,87 @@
#!/bin/bash

# Stop K8s-related services
echo "Stopping K8s-related services..."
ps -aux | grep Opera | grep -v grep | awk '{print $2}' | xargs kill -9
systemctl stop kubelet
echo "Removing leftover CNI network-interface configuration"
ip link delete cni0
systemctl stop kube-apiserver

systemctl stop nfs-kernel-server
rm -rf /k8sdata/*

# Run kubeadm reset
echo "Running kubeadm reset..."
kubeadm reset -f

apt-get purge kubelet kubectl kubeadm kubernetes-cni -y --allow-change-held-packages
rm -rf /etc/cni/net.d

rm -rf /var/lib/kubelet /var/lib/kubernetes
rm -rf /etc/kubernetes/manifests
rm -rf /etc/kubernetes/pki
rm -rf /etc/kubernetes
rm -rf /var/lib/etcd
rm -rf /var/lib/cni
rm -rf /var/lib/docker
rm -rf /var/lib/containerd
rm -rf /var/lib/kube-proxy
# Delete K8s configuration files
echo "Deleting K8s configuration files..."
sudo rm -rf /etc/kubernetes

# Delete K8s binaries (assumed to live in /usr/local/bin)
echo "Deleting K8s binaries..."
sudo rm /usr/local/bin/kube*

# Clean containerd data (use with caution)
echo "Cleaning containerd data..."
sudo rm -rf /var/lib/containerd /usr/bin/containerd*
sudo apt purge -y containerd containerd.io cri-tools --allow-change-held-packages
rm -rf /etc/containerd /var/lib/containerd /run/containerd
rm -f /etc/systemd/system/multi-user.target.wants/containerd.service
rm /lib/systemd/system/containerd.service
systemctl daemon-reload

# Clean iptables rules
echo "Cleaning iptables rules..."
# sudo iptables -F && sudo iptables -t nat -F && sudo iptables -t mangle -F && sudo iptables -X
# ipvsadm --clear
apt autoremove -y
apt autoclean -y
apt clean -y
apt update -y

# Stop docker
echo "Stopping docker and cleaning up..."
docker rmi $(docker images -q)
docker stop $(docker ps -aq) && docker rm $(docker ps -aq) && sudo systemctl stop docker
sudo systemctl stop docker.service
sudo systemctl stop docker.socket
rm -rf /etc/docker/daemon.json
rm -rf /usr/bin/docker-compose

# Purge Docker
apt-get purge docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin docker-ce-rootless-extras -y --allow-change-held-packages
apt purge -y containerd.io containerd

# Remove the current root user's kubeconfig file, if any
echo "Removing the current root user's kubeconfig file..."
sudo rm -rf $HOME/.kube/config

# kuboard
echo "Cleaning kuboard-related configuration..."
sed -i '/\/opt \*(rw,sync,no_root_squash)/d' /etc/exports

rm -rf /etc/apt/sources.list.d/docker*
rm -rf /etc/apt/sources.list.d/kubernetes*
rm -rf /etc/apt/sources.list.d/kuboard*

apt autoremove -y
apt autoclean -y
apt clean -y

echo "Congratulations! All K8s-related data has been cleaned up; the machine is ready for reinstallation."
BIN
script/libnvidia-container-tools_1.17.8-1_amd64.deb
Normal file
Binary file not shown.
BIN
script/libnvidia-container1_1.17.8-1_amd64.deb
Normal file
Binary file not shown.
BIN
script/nvidia-container-toolkit-base_1.17.8-1_amd64.deb
Normal file
Binary file not shown.
BIN
script/nvidia-container-toolkit_1.17.8-1_amd64.deb
Normal file
Binary file not shown.
15
script_test/readyaml.sh
Executable file
@ -0,0 +1,15 @@
#!/bin/bash

nfs_dynamic_yaml=""
while IFS= read -r line; do
    nfs_dynamic_yaml+="$line"$'\n'   # append a real newline after each line
done < ../files/nfs-provisioner-deploy.yaml

echo "Content:"
echo "$nfs_dynamic_yaml"

echo "$nfs_dynamic_yaml" | kubectl apply -f -
if [ $? -ne 0 ]; then
    echo "Resource creation failed"
    exit 1
fi
11
wwwroot/api/v1/cluster/common/delete_cluster_node/index.dspy
Normal file
@ -0,0 +1,11 @@
info(f'test .....{params_kw=}')

#debug(f"accept cpcc node: {params_kw=}")
try:
    result_dict["data"] = delete_cluster_node(params_kw)
    result_dict["status"] = True
    result_dict["info"] = "operate success"
except Exception:
    import traceback
    debug(traceback.format_exc())
return result_dict
11
wwwroot/api/v1/cluster/common/delete_cluster_pod/index.dspy
Normal file
@ -0,0 +1,11 @@
info(f'test .....{params_kw=}')

#debug(f"accept cpcc node: {params_kw=}")
try:
    result_dict["data"] = delete_cluster_pod(params_kw)
    result_dict["status"] = True
    result_dict["info"] = "operate success"
except Exception:
    import traceback
    debug(traceback.format_exc())
return result_dict
11
wwwroot/api/v1/cluster/common/delete_cpcpod/index.dspy
Normal file
@ -0,0 +1,11 @@
info(f'test .....{params_kw=}')

#debug(f"delete_cpcpod received cpcc params: {params_kw=}")

try:
    result_dict["data"] = yaml_apply_delete(params_kw)
    result_dict["status"] = True
    result_dict["info"] = "operate success"
except Exception as e:
    debug(f'{e}')
return result_dict
@ -0,0 +1,16 @@
info(f'test .....{params_kw=}')

#endpoint = params_kw["endpoint"]

#if "_" not in endpoint and "-" not in endpoint:
#    result_dict["info"] = "endpoint format not allowed"
#    return result_dict
#debug(f"accept cpcc node: {params_kw=}")
try:
    result_dict["data"] = determine_accommodat_by_kubeconfig(params_kw)
    result_dict["status"] = True
    result_dict["info"] = "operate success"
except Exception:
    import traceback
    debug(traceback.format_exc())
return result_dict
16
wwwroot/api/v1/cluster/common/get_cluster_nodes/index.dspy
Normal file
@ -0,0 +1,16 @@
info(f'test .....{params_kw=}')

#endpoint = params_kw["endpoint"]

#if "_" not in endpoint and "-" not in endpoint:
#    result_dict["info"] = "endpoint format not allowed"
#    return result_dict
#debug(f"accept cpcc node: {params_kw=}")
try:
    result_dict["data"] = get_cluster_nodes_by_kubeconfig(params_kw)
    result_dict["status"] = True
    result_dict["info"] = "operate success"
except Exception:
    import traceback
    debug(traceback.format_exc())
return result_dict
16
wwwroot/api/v1/cluster/common/get_cluster_pods/index.dspy
Normal file
@ -0,0 +1,16 @@
info(f'test .....{params_kw=}')

#endpoint = params_kw["endpoint"]

#if "_" not in endpoint and "-" not in endpoint:
#    result_dict["info"] = "endpoint format not allowed"
#    return result_dict
#debug(f"accept cpcc node: {params_kw=}")
try:
    result_dict["data"] = get_cluster_pods_by_kubeconfig(params_kw)
    result_dict["status"] = True
    result_dict["info"] = "operate success"
except Exception:
    import traceback
    debug(traceback.format_exc())
return result_dict
10
wwwroot/api/v1/cluster/common/multiple_cluster/index.dspy
Normal file
@ -0,0 +1,10 @@
info(f'test .....{params_kw=}')

try:
    result_dict["data"] = get_multiple_cluster()
    result_dict["status"] = True
    result_dict["info"] = "operate success"
except Exception as e:
    debug(f'{e}')
    result_dict["data"] = []  # was [result], but result is undefined on this path
return result_dict
@ -0,0 +1,10 @@
info(f'test .....{params_kw=}')

try:
    result_dict["data"] = get_multiple_cluster_pod()
    result_dict["status"] = True
    result_dict["info"] = "operate success"
except Exception as e:
    debug(f'{e}')
    result_dict["data"] = []  # was [result], but result is undefined on this path
return result_dict
11
wwwroot/api/v1/cluster/common/new_cluster/index.dspy
Normal file
@ -0,0 +1,11 @@
info(f'test .....{params_kw=}')

debug(f"received cpcc params: {params_kw=}")

try:
    result_dict["data"] = new_cluster_install(params_kw)
    result_dict["status"] = True
    result_dict["info"] = "operate success"
except Exception as e:
    debug(f'{e}')
return result_dict
11
wwwroot/api/v1/cluster/common/new_worker/index.dspy
Normal file
@ -0,0 +1,11 @@
info(f'test .....{params_kw=}')

debug(f"received cpcc params: {params_kw=}")

try:
    result_dict["data"] = new_cluster_install(params_kw)
    result_dict["status"] = True
    result_dict["info"] = "operate success"
except Exception as e:
    debug(f'{e}')
return result_dict
11
wwwroot/api/v1/cluster/common/node_label_opt/index.dspy
Normal file
@ -0,0 +1,11 @@
info(f'test .....{params_kw=}')

debug(f"node_label_opt received cpcc params: {params_kw=}")

try:
    result_dict["data"] = node_label_opt(params_kw)
    result_dict["status"] = True
    result_dict["info"] = "operate success"
except Exception as e:
    debug(f'{e}')
return result_dict
11
wwwroot/api/v1/cluster/common/node_state_switch/index.dspy
Normal file
@ -0,0 +1,11 @@
info(f'test .....{params_kw=}')

debug(f"received cpcc params: {params_kw=}")

try:
    result_dict["data"] = node_state_switch(params_kw)
    result_dict["status"] = True
    result_dict["info"] = "operate success"
except Exception as e:
    debug(f'{e}')
return result_dict
11
wwwroot/api/v1/cluster/common/update_cpcpod/index.dspy
Normal file
@ -0,0 +1,11 @@
info(f'test .....{params_kw=}')

#debug(f"update_cpcpod received cpcc params: {params_kw=}")

try:
    result_dict["data"] = yaml_apply_delete(params_kw)
    result_dict["status"] = True
    result_dict["info"] = "operate success"
except Exception as e:
    debug(f'{e}')
return result_dict
12
wwwroot/api/v1/cluster/common/yaml_apply/index.dspy
Normal file
@ -0,0 +1,12 @@
info(f'test .....{params_kw=}')

debug(f"received cpcc params: {params_kw=}")

try:
    result_dict["data"] = yaml_apply_delete(params_kw)
    result_dict["status"] = True
    result_dict["info"] = "operate success"
except Exception as e:
    debug(f'{e}')
    result_dict["info"] = str(e)  # str() keeps the response JSON-serializable
return result_dict
73
wwwroot/api/v1/create_pod.dspy
Normal file
@ -0,0 +1,73 @@
async def create_pod(ns={}):
    import hashlib
    import time
    ns['pvcname'] = hashlib.md5(str(time.time()).encode()).hexdigest()[:10]
    ns['podname'] = ns['pvcname']
    ns['containername'] = ns['pvcname']
    ns['volumename'] = ns['pvcname']
    ns['namespace'] = ns['namespace'] if ns.get('namespace') else 'default'

    namespace = ns['namespace']  # target namespace
    core_api = client.CoreV1Api()

    # Create the PVC
    #create_persistent_volume_claim(core_api, namespace)
    pvc = client.V1PersistentVolumeClaim(
        metadata=client.V1ObjectMeta(name=ns['pvcname']),
        spec=client.V1PersistentVolumeClaimSpec(
            access_modes=["ReadWriteOnce"],
            resources=client.V1ResourceRequirements(
                requests={"storage": str(ns['storage']) + "Gi"}
            )
        )
    )
    core_api.create_namespaced_persistent_volume_claim(namespace=namespace, body=pvc)
    print("PVC created.")

    # Create the Pod
    # create_pod(core_api, namespace)
    pod = client.V1Pod(
        metadata=client.V1ObjectMeta(name=ns['podname']),
        spec=client.V1PodSpec(
            containers=[
                client.V1Container(
                    name=ns['containername'],
                    image=ns['image'],  # image requested by the caller
                    command=["tail", "-f", "/dev/null"],  # keep the container running
                    resources=client.V1ResourceRequirements(
                        requests={
                            "cpu": str(ns['cpu']),               # requested CPU cores
                            "memory": str(ns['memory']) + "Gi",  # requested memory
                        },
                        limits={
                            "cpu": str(ns['cpu']),               # CPU limit
                            "memory": str(ns['memory']) + "Gi",  # memory limit
                            "nvidia.com/gpu": ns['gpu'],
                            "nvidia.com/gpumem": ns['gpumem']
                        },
                    ),
                    volume_mounts=[
                        client.V1VolumeMount(
                            name=ns['volumename'],
                            mount_path="/usr/share/",  # mount path inside the container
                        )
                    ],
                )
            ],
            volumes=[
                client.V1Volume(
                    name=ns['volumename'],
                    persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
                        claim_name=ns['pvcname']
                    ),
                )
            ],
        )
    )
    core_api.create_namespaced_pod(namespace=namespace, body=pod)
    ns['status'] = True
    ns['msg'] = 'Instance created successfully'
    return ns

ret = await create_pod(params_kw)
return ret
103
wwwroot/api/v1/get_available_resources.dspy
Normal file
@ -0,0 +1,103 @@
async def get_available_resources(ns={}):

    # Create the API client
    v1 = client.CoreV1Api()

    # Fetch all nodes
    nodes = v1.list_node()
    # Fetch all pods
    pods = v1.list_pod_for_all_namespaces()

    # Per-node resource bookkeeping
    node_resources = {}

    total_allocatable = {
        'cpu': 0,
        'memory': 0,
        'gpu': 0,
        'storage': 0
    }
    total_used = {
        'cpu': 0,
        'memory': 0,
        'gpu': 0,
        'storage': 0
    }

    for node in nodes.items:
        name = node.metadata.name
        allocatable = node.status.allocatable
        node_resources[name] = {
            'cpu_allocatable': int(allocatable.get('cpu', '0').rstrip('m')) / 1000 if 'm' in allocatable.get('cpu', '0') else int(allocatable.get('cpu', '0')),
            'memory_allocatable': int(allocatable.get('memory', '0').rstrip('Ki')) / 1024,  # Ki -> Mi, to match the per-pod accounting below
            'gpu_allocatable': int(allocatable.get('nvidia.com/gpu', '0')),
            'storage_allocatable': int(allocatable.get('ephemeral-storage', '0').rstrip('Ki')) / 1024  # Ki -> Mi
        }
        # Accumulate total allocatable resources
        total_allocatable['cpu'] += node_resources[name]['cpu_allocatable']
        total_allocatable['memory'] += node_resources[name]['memory_allocatable']
        total_allocatable['gpu'] += node_resources[name]['gpu_allocatable']
        total_allocatable['storage'] += node_resources[name]['storage_allocatable']

        # Initialize the used counters
        node_resources[name].update({
            'cpu_used': 0,
            'memory_used': 0,
            'gpu_used': 0,
            'storage_used': 0
        })

    # Walk all pods and add up the resources requested on each node
    for pod in pods.items:
        if pod.spec.node_name:  # only pods already scheduled onto a node
            node_name = pod.spec.node_name
            for container in pod.spec.containers:
                reque = container.resources.requests or {}
                node_resources[node_name]['cpu_used'] += float(reque.get('cpu', '0').rstrip('m')) / 1000 if 'm' in reque.get('cpu', '0') else float(reque.get('cpu', '0'))
                node_resources[node_name]['memory_used'] += int(reque.get('memory', '0').rstrip('Mi')) if 'Mi' in reque.get('memory', '0') else int(reque.get('memory', '0').rstrip('Gi')) * 1024
                node_resources[node_name]['gpu_used'] += int(reque.get('nvidia.com/gpu', '0'))
                node_resources[node_name]['storage_used'] += int(reque.get('ephemeral-storage', '0').rstrip('Mi')) if 'Mi' in reque.get('ephemeral-storage', '0') else 0

    # Add up total used resources
    for node_name, resources in node_resources.items():
        total_used['cpu'] += resources['cpu_used']
        total_used['memory'] += resources['memory_used']
        total_used['gpu'] += resources['gpu_used']
        total_used['storage'] += resources['storage_used']

        # Per-node remaining resources
        print(f"Node: {node_name}")
        print(f"  CPU Remaining: {resources['cpu_allocatable'] - resources['cpu_used']} cores")
        print(f"  Memory Remaining: {resources['memory_allocatable'] - resources['memory_used']} Mi")
        print(f"  GPU Remaining: {resources['gpu_allocatable'] - resources['gpu_used']} GPUs")
        print(f"  Storage Remaining: {resources['storage_allocatable'] - resources['storage_used']} Mi")
        print()

    # Total remaining resources and usage percentages
    total_remaining = {key: total_allocatable[key] - total_used[key] for key in total_allocatable}
    usage_percentage = {key: (total_used[key] / total_allocatable[key] * 100 if total_allocatable[key] > 0 else 0) for key in total_allocatable}

    # Print the cluster-wide summary
    print("Cluster Resource Summary:")
    print(f"  Total Allocatable CPU: {total_allocatable['cpu']} cores")
    print(f"  Total Allocatable Memory: {total_allocatable['memory']} Mi")
    print(f"  Total Allocatable GPU: {total_allocatable['gpu']} GPUs")
    print(f"  Total Allocatable Storage: {total_allocatable['storage']} Mi")
    print()
    print(f"  CPU Usage Percentage: {usage_percentage['cpu']:.2f}%")
    print(f"  Memory Usage Percentage: {usage_percentage['memory']:.2f}%")
    print(f"  GPU Usage Percentage: {usage_percentage['gpu']:.2f}%")
    print(f"  Storage Usage Percentage: {usage_percentage['storage']:.2f}%")
    print()

    # Return the aggregated data
    return {
        "total_allocatable": total_allocatable,
        "total_used": total_used,
        "total_remaining": total_remaining,
        "usage_percentage": usage_percentage
    }

ret = await get_available_resources(params_kw)
return ret
11
wwwroot/api/v1/ldap/add_ldap_user/index.dspy
Normal file
@ -0,0 +1,11 @@
info(f'test .....{params_kw=}')
print(params_kw)
uid = params_kw["uid"]
uid_number = params_kw["uid_number"]
plaintext_password = params_kw["plaintext_password"]
cn = params_kw["cn"]

result = add_ldap_user(uid, uid_number, plaintext_password, cn)

result_dict["data"] = result
return result_dict
7
wwwroot/api/v1/ldap/delete_ldap_user/index.dspy
Normal file
@ -0,0 +1,7 @@
info(f'test .....{params_kw=}')
uid = params_kw["uid"]

result = delete_ldap_user(uid)

result_dict["data"] = result
return result_dict
6
wwwroot/api/v1/ldap/get_all_ldap_cn/index.dspy
Normal file
@ -0,0 +1,6 @@
info(f'test .....{params_kw=}')

result = get_all_ldap_cn()

result_dict["data"] = result
return result_dict
6
wwwroot/api/v1/ldap/get_all_ldap_user/index.dspy
Normal file
@ -0,0 +1,6 @@
info(f'test .....{params_kw=}')

result = get_all_ldap_user()

result_dict["data"] = result
return result_dict
7
wwwroot/api/v1/ldap/get_one_cn/index.dspy
Normal file
@ -0,0 +1,7 @@
info(f'test .....{params_kw=}')

cn = params_kw["cn"]
result = get_one_cn(cn)

result_dict["data"] = result
return result_dict
91
wwwroot/api/v1/resource_enough.dspy
Normal file
@ -0,0 +1,91 @@
# -*- coding: utf-8 -*-
# @Time: 2024/12/5 17:07

import kubernetes.client
from kubernetes.client.rest import ApiException
from kubernetes import config

# Load the Kubernetes config (can be skipped when running inside the cluster)
config.load_kube_config()


def check_resource_availability(cpu_request, memory_request, storage_request, gpu_request):
    """
    Check whether the cluster has enough resources to create the Pod.
    :param cpu_request: requested CPU in millicores
    :param memory_request: requested memory in MiB
    :param storage_request: requested storage in GiB
    :param gpu_request: requested number of GPUs
    :return: whether some node has enough resources
    """
    v1 = kubernetes.client.CoreV1Api()

    # Fetch all nodes
    nodes = v1.list_node()

    for node in nodes.items:
        cpu_capacity = node.status.capacity['cpu']
        memory_capacity = node.status.capacity['memory']
        storage_capacity = node.status.capacity.get('ephemeral-storage', '0Gi')  # some nodes do not report storage
        gpu_capacity = 0  # no GPU by default
        if 'nvidia.com/gpu' in node.status.capacity:
            gpu_capacity = node.status.capacity['nvidia.com/gpu']

        # Normalize units before comparing
        cpu_capacity = int(cpu_capacity)                                 # whole cores
        memory_capacity = int(memory_capacity[:-2]) // 1024              # strip the trailing "Ki", convert to MiB
        storage_capacity = int(storage_capacity[:-2]) // (1024 * 1024)   # strip the unit suffix, convert Ki to GiB
        gpu_capacity = int(gpu_capacity)

        # Does this node satisfy the request?
        if (cpu_capacity * 1000 >= cpu_request and   # capacity is in cores, request in millicores
                memory_capacity >= memory_request and
                storage_capacity >= storage_request and
                gpu_capacity >= gpu_request):
            print(f"Node {node.metadata.name} has enough resources.")
            return True
    return False


def create_pod(cpu_request, memory_request, storage_request, gpu_request):
    """Create a Pod after checking that enough resources are available."""
    if not check_resource_availability(cpu_request, memory_request, storage_request, gpu_request):
        print("No node has enough resources to fulfill the request.")
        return

    # Enough resources; create the Pod
    v1 = kubernetes.client.CoreV1Api()
    pod_manifest = {
        "apiVersion": "v1",
        "kind": "Pod",
        "metadata": {"name": "my-pod"},
        "spec": {
            "containers": [{
                "name": "my-container",
                "image": "nginx",  # replace with the image you need
                "resources": {
                    "requests": {
                        "cpu": f"{cpu_request}m",                     # millicpu
                        "memory": f"{memory_request}Mi",              # MiB
                        "ephemeral-storage": f"{storage_request}Gi",  # GiB
                    },
                    "limits": {
                        "cpu": f"{cpu_request}m",
                        "memory": f"{memory_request}Mi",
                        "ephemeral-storage": f"{storage_request}Gi",
                    }
                }
            }]
        }
    }

    try:
        # Create the Pod
        v1.create_namespaced_pod(namespace="default", body=pod_manifest)
        print("Pod created successfully.")
    except ApiException as e:
        print(f"Error creating pod: {e}")


# Example: request 1 CPU, 2 GiB memory, 30 GiB storage, 0 GPUs
create_pod(1000, 2048, 30, 0)  # 1000m CPU, 2048Mi memory, 30Gi storage, 0 GPU
33
wwwroot/api/v1/server_instance_delete.dspy
Normal file
@ -0,0 +1,33 @@
async def server_instance_delete(ns={}):
    from kubernetes.client.rest import ApiException

    # Load the kubeconfig
    # config.load_kube_config()

    # Name and namespace of the pod to delete
    namespace = ns['namespace'] if ns.get('namespace') else 'default'
    podname = ns.get('podname')
    pvcname = ns.get('pvcname')

    # Create the Pod API client
    v1 = client.CoreV1Api()
    try:
        # Delete the Pod
        v1.delete_namespaced_pod(name=podname, namespace=namespace)
        print(f"Pod {podname} deleted")
        v1.delete_namespaced_persistent_volume_claim(name=pvcname, namespace=namespace)
        print(f"PVC {pvcname} deleted")
        return {
            'status': True,
            'msg': 'Instance deleted successfully'
        }
    except ApiException as e:
        print(f"Failed to delete Pod/PVC: {e}")
        return {
            'status': False,
            'msg': 'Instance deletion failed, %s' % str(e)
        }

ret = await server_instance_delete(params_kw)
return ret
7
wwwroot/api/v1/slurm/job/get_history_list/index.dspy
Normal file
@ -0,0 +1,7 @@
info(f'test .....{params_kw=}')

result = get_history_list(params_kw)

result_dict["data"] = result
return result_dict
@ -0,0 +1,7 @@
info(f'test .....{params_kw=}')

result = get_history_list_json(params_kw)

result_dict["data"] = result
return result_dict
7
wwwroot/api/v1/slurm/job/get_real_time_list/index.dspy
Normal file
@ -0,0 +1,7 @@
info(f'test .....{params_kw=}')

result = get_real_time_list(params_kw)

result_dict["data"] = result
return result_dict
@ -0,0 +1,7 @@
info(f'test .....{params_kw=}')

result = get_real_time_list_json(params_kw)

result_dict["data"] = result
return result_dict
7
wwwroot/api/v1/slurm/job/kill_job/index.dspy
Normal file
@ -0,0 +1,7 @@
info(f'test .....{params_kw=}')

result = kill_job(params_kw["jobId"])

result_dict["data"] = result
return result_dict
7
wwwroot/api/v1/slurm/job/resume_job/index.dspy
Normal file
@ -0,0 +1,7 @@
info(f'test .....{params_kw=}')

result = resume_job(params_kw["jobId"])

result_dict["data"] = result
return result_dict
7
wwwroot/api/v1/slurm/job/submit_job/index.dspy
Normal file
@ -0,0 +1,7 @@
info(f'test .....{params_kw=}')

result = submit_job(params_kw["command"])

result_dict["data"] = result
return result_dict
7
wwwroot/api/v1/slurm/job/suspend_job/index.dspy
Normal file
@ -0,0 +1,7 @@
info(f'test .....{params_kw=}')

result = suspend_job(params_kw["jobId"])

result_dict["data"] = result
return result_dict
20
wwwroot/api/v1/slurm/node/list_node_details_json/index.dspy
Normal file
@ -0,0 +1,20 @@
info(f'test .....{params_kw=}')

if "PartitionName" in params_kw:
    result_partition = list_partition_detail_json(params_kw)
    if len(result_partition) > 0:
        result = get_node_details_json(result_partition[0]["Nodes"])
    else:
        result = []
else:
    result = get_node_details_json(params_kw["NodeName"])

result_dict["data"] = result
return result_dict
7
wwwroot/api/v1/slurm/node/update_node/index.dspy
Normal file
@ -0,0 +1,7 @@
info(f'test .....{params_kw=}')

result = update_node(params_kw)

result_dict["data"] = result
return result_dict
@ -0,0 +1,7 @@
info(f'test .....{params_kw=}')

result = create_partition(params_kw)

result_dict["data"] = result
return result_dict
@ -0,0 +1,7 @@
info(f'test .....{params_kw=}')

result = delete_partition(params_kw)

result_dict["data"] = result
return result_dict
@ -0,0 +1,5 @@

result = list_partition_detail_json(params_kw)

result_dict["data"] = result
return result_dict
@ -0,0 +1,6 @@
info(f'test .....{params_kw=}')

result = list_partition_info(params_kw)

result_dict["data"] = result
return result_dict
@ -0,0 +1,7 @@
info(f'test .....{params_kw=}')

result = update_partition(params_kw)

result_dict["data"] = result
return result_dict
@ -0,0 +1,7 @@
info(f'test .....{params_kw=}')

result = get_storage_json(params_kw["point"])

result_dict["data"] = result
return result_dict
223
wwwroot/doc/ldap/ldap相关.md
Normal file
@ -0,0 +1,223 @@
# Authentication

- HTTP Authentication, scheme: basic

# LDAP

## GET Get all LDAP users

GET /api/v1/ldap/get_all_ldap_user

> Example response

> 200 Response

```json
{
  "status": "success",
  "data": [
    {
      "attributes": {
        "cn": [
          "test1"
        ],
        "mail": [],
        "sn": [
          "test1"
        ]
      },
      "dn": "uid=test1,ou=test,dc=test,dc=com"
    },
    {
      "attributes": {
        "cn": [
          "test"
        ],
        "mail": [],
        "sn": [
          "test_add2"
        ]
      },
      "dn": "uid=test_add2,ou=test,dc=test,dc=com"
    }
  ]
}
```

### Response

| Status code | Meaning | Description | Data model |
| --- | ------------------------------------------------------- | ---- | ------ |
| 200 | [OK](https://tools.ietf.org/html/rfc7231#section-6.3.1) | none | Inline |

## GET Delete an LDAP user

GET /api/v1/ldap/delete_ldap_user

### Request parameters

| Name | In | Type | Required | Description |
| --- | ----- | ------ | --- | ---- |
| uid | query | string | No | none |

> Example response

> 200 Response

```json
{
  "status": "success",
  "data": {
    "result": 0,
    "description": "success",
    "dn": "",
    "message": "",
    "referrals": null,
    "type": "delResponse"
  }
}
```

### Response

| Status code | Meaning | Description | Data model |
| --- | ------------------------------------------------------- | ---- | ------ |
| 200 | [OK](https://tools.ietf.org/html/rfc7231#section-6.3.1) | none | Inline |

## GET Get one CN

GET /api/v1/ldap/get_one_cn

### Request parameters

| Name | In | Type | Required | Description |
| --- | ----- | ------ | --- | ---- |
| cn | query | string | No | none |

> Example response

> 200 Response

```json
{
  "status": "success",
  "data": {
    "attributes": {
      "cn": [
        "test"
      ],
      "gidNumber": [
        47758
      ],
      "objectClass": [
        "posixGroup",
        "top"
      ]
    },
    "dn": "cn=test,ou=test,dc=test,dc=com"
  }
}
```

### Response

| Status code | Meaning | Description | Data model |
| --- | ------------------------------------------------------- | ---- | ------ |
| 200 | [OK](https://tools.ietf.org/html/rfc7231#section-6.3.1) | none | Inline |

## GET Add an LDAP user

GET /api/v1/ldap/add_ldap_user

### Request parameters

| Name | In | Type | Required | Description |
| ------------------ | ----- | ------ | --- | ------ |
| uid | query | string | No | cluster account |
| uid_number | query | string | No | cluster account id |
| plaintext_password | query | string | No | password |
| cn | query | string | No | none |

> Example response

> 200 Response

```json
{
  "status": "success",
  "data": {
    "result": 0,
    "description": "success",
    "dn": "",
    "message": "",
    "referrals": null,
    "type": "modifyResponse"
  }
}
```

### Response

| Status code | Meaning | Description | Data model |
| --- | ------------------------------------------------------- | ---- | ------ |
| 200 | [OK](https://tools.ietf.org/html/rfc7231#section-6.3.1) | none | Inline |

## GET Get all LDAP CNs

GET /api/v1/ldap/get_all_ldap_cn

> Example response

> 200 Response

```json
{
  "status": "success",
  "data": [
    {
      "attributes": {
        "cn": [
          "testGroup"
        ],
        "gidNumber": [
          34423
        ],
        "objectClass": [
          "posixGroup",
          "top"
        ]
      },
      "dn": "cn=testGroup,ou=test,dc=test,dc=com"
    },
    {
      "attributes": {
        "cn": [
          "test"
        ],
        "gidNumber": [
          47758
        ],
        "objectClass": [
          "posixGroup",
          "top"
        ]
      },
      "dn": "cn=test,ou=test,dc=test,dc=com"
    }
  ]
}
```

### Response

| Status code | Meaning | Description | Data model |
| --- | ------------------------------------------------------- | ---- | ------ |
| 200 | [OK](https://tools.ietf.org/html/rfc7231#section-6.3.1) | none | Inline |
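
A minimal client sketch for these endpoints. It assumes the `requests` library, a placeholder base URL, and placeholder credentials (replace all three with your real server address and account); the response shape follows the examples above.

```python
import requests

BASE = "https://pcapi.example.com"  # hypothetical server address
AUTH = ("user", "password")         # BasicAuth credentials

# Create a user, then confirm it shows up in the full listing
resp = requests.get(f"{BASE}/api/v1/ldap/add_ldap_user", auth=AUTH, params={
    "uid": "test_add2",
    "uid_number": "10001",
    "plaintext_password": "secret",
    "cn": "test",
})
print(resp.json())  # expect {"status": "success", "data": {...}}

users = requests.get(f"{BASE}/api/v1/ldap/get_all_ldap_user", auth=AUTH).json()
for user in users["data"]:
    print(user["dn"], user["attributes"]["cn"])
```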
15
wwwroot/doc/slurm/作业相关.md
Normal file
@ -0,0 +1,15 @@
## GET V1 Get historical jobs (JSON)
* url: /api/v1/slurm/job/get_history_list_json
* query:
  * startStartTime: only jobs submitted after this time
  * group: cluster user group
  * jobId: job id
  * accountUserName: cluster account
  * jobIdList: list of job ids
  * statusList: list of statuses


### List running jobs
`/api/v1/slurm/job/get_history_list_json?statusList=running`
### List jobs from 2025
`/api/v1/slurm/job/get_history_list_json?startStartTime=2025-01-01T00:00:00`
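
A minimal query sketch, assuming the `requests` library and placeholder server address and credentials; the query parameters are the ones listed above.

```python
import requests

BASE = "https://pcapi.example.com"  # hypothetical server address
AUTH = ("user", "password")         # BasicAuth credentials

# All jobs submitted since 2025-01-01 that are still running
resp = requests.get(
    f"{BASE}/api/v1/slurm/job/get_history_list_json",
    auth=AUTH,
    params={"startStartTime": "2025-01-01T00:00:00", "statusList": "running"},
)
for job in resp.json()["data"]:
    print(job)
```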
14
wwwroot/doc/slurm/节点相关.md
Normal file
@ -0,0 +1,14 @@
## GET V1 Update a node
* url: /api/v1/slurm/node/update_node
* query
  * NodeName: node name
  * State: target state
  * Reason: reason

### Put a node into maintenance (drain)
`/api/v1/slurm/node/update_node?NodeName=CENI-KFSJk&State=DRAIN&Reason=weihu`

### Bring a node back online (resume)
`/api/v1/slurm/node/update_node?NodeName=node01&State=RESUME`
### Take a node offline
`/api/v1/slurm/node/update_node?NodeName=CENI-KFSJk&State=DOWN&Reason=weihu`
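
A small helper wrapping this endpoint, assuming `requests` and placeholder server address and credentials:

```python
import requests

BASE = "https://pcapi.example.com"  # hypothetical server address
AUTH = ("user", "password")         # BasicAuth credentials

def set_node_state(node, state, reason=None):
    """Thin wrapper over /api/v1/slurm/node/update_node."""
    params = {"NodeName": node, "State": state}
    if reason:
        params["Reason"] = reason
    return requests.get(f"{BASE}/api/v1/slurm/node/update_node",
                        auth=AUTH, params=params).json()

set_node_state("node01", "DRAIN", "maintenance")  # take the node out of scheduling
set_node_state("node01", "RESUME")                # bring it back online
```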
317
wwwroot/doc/slurm/队列相关.md
Normal file
@ -0,0 +1,317 @@
## GET V1 Query partition details (JSON)
* url: /api/v1/slurm/partition/list_partition_detail_json
* response
```json
{
  "status": "success",
  "data": [
    {
      "PartitionName": "master",
      "AllowGroups": "ALL",
      "AllowAccounts": "ALL",
      "AllowQos": "ALL",
      "AllocNodes": "ALL",
      "Default": "NO",
      "QoS": "N/A",
      "DefaultTime": "NONE",
      "DisableRootJobs": "NO",
      "ExclusiveUser": "NO",
      "GraceTime": "0",
      "Hidden": "NO",
      "MaxNodes": "UNLIMITED",
      "MaxTime": "UNLIMITED",
      "MinNodes": "0",
      "LLN": "NO",
      "MaxCPUsPerNode": "UNLIMITED",
      "Nodes": "CENI-KFSJK,CENI-CSSJK",
      "PriorityJobFactor": "1",
      "PriorityTier": "1",
      "RootOnly": "NO",
      "ReqResv": "NO",
      "OverSubscribe": "NO",
      "OverTimeLimit": "NONE",
      "PreemptMode": "OFF",
      "State": "UP",
      "TotalCPUs": "16",
      "TotalNodes": "2",
      "SelectTypeParameters": "NONE",
      "JobDefaults": "(null)",
      "DefMemPerNode": "UNLIMITED",
      "MaxMemPerNode": "UNLIMITED"
    }
  ]
}
```


## GET V1 Create a partition
* url: /api/v1/slurm/partition/create_partition
### Create a partition named kaiyuanyun
* query
  * PartitionName: kaiyuanyun

`/api/v1/slurm/partition/create_partition?PartitionName=kaiyuanyun`
### Create a partition named kaiyuanyun containing node CENI-KFSJK
* query
  * PartitionName: kaiyuanyun
  * Nodes: CENI-KFSJK

`/api/v1/slurm/partition/create_partition?PartitionName=kaiyuanyun&nodes=CENI-KFSJK`

### Create a partition named kaiyuanyun containing node CENI-KFSJK and restricted to testgroup
`/api/v1/slurm/partition/create_partition?PartitionName=kaiyuanyun&nodes=CENI-KFSJK&AllowGroups=testgroup`


## GET V1 Update a partition
* url: /api/v1/slurm/partition/update_partition

### Set the nodes of partition kaiyuanyun to CENI-KFSJK
* query
  * PartitionName: kaiyuanyun
  * Nodes: CENI-KFSJK

`/api/v1/slurm/partition/update_partition?PartitionName=kaiyuanyun&nodes=CENI-KFSJK`

### Set partition kaiyuanyun to node CENI-KFSJK and restrict it to testgroup
`/api/v1/slurm/partition/update_partition?PartitionName=kaiyuanyun&nodes=CENI-KFSJK&AllowGroups=testgroup`
### Enable a partition
`/api/v1/slurm/partition/update_partition?PartitionName=kaiyuanyun&state=UP`
### Disable a partition
`/api/v1/slurm/partition/update_partition?PartitionName=kaiyuanyun&state=DOWN`


## GET V1 Delete a partition
* url: /api/v1/slurm/partition/delete_partition

### Delete the kaiyuanyun partition
* query
  * PartitionName: kaiyuanyun

`/api/v1/slurm/partition/delete_partition?PartitionName=kaiyuanyun`

Field reference

| Field | Meaning | Example | Notes |
| --- | --- | --- | --- |
| PartitionName | Partition name | `master` | Unique identifier of the partition; users can target it when submitting jobs. |
| AllowGroups | User groups allowed to use the partition | `ALL` | `ALL` means every user group may use it. |
| AllowAccounts | Accounts allowed to use the partition | `ALL` | `ALL` means every account may use it. |
| AllowQos | QoS levels allowed in the partition | `ALL` | `ALL` means every QoS may be used here. |
| AllocNodes | Node-allocation rule | `ALL` | `ALL` means any node may be allocated. |
| Default | Whether this is the default partition | `NO` | Jobs submitted without a partition go to the default partition. |
| QoS | Default QoS of the partition | `N/A` | `N/A` means no default QoS is configured. |
| DefaultTime | Default job time limit | `NONE` | `NONE` means no default time limit is configured. |
| DisableRootJobs | Whether root is blocked from submitting jobs | `NO` | `NO` means root may submit jobs. |
| ExclusiveUser | Whether a user may hold the partition exclusively | `NO` | `NO` means exclusive use is not allowed. |
| GraceTime | Grace period after a job ends (seconds) | `0` | `0` means resources are released immediately. |
| Hidden | Whether the partition is hidden | `NO` | `NO` means the partition is visible to users. |
| MaxNodes | Maximum nodes per job | `UNLIMITED` | `UNLIMITED` means no limit. |
| MaxTime | Maximum job run time | `UNLIMITED` | `UNLIMITED` means jobs may run indefinitely. |
| MinNodes | Minimum nodes per job | `0` | `0` means any number of nodes is allowed. |
| LLN | Whether this is a Low Latency Network partition | `NO` | `NO` means it is not. |
| MaxCPUsPerNode | Maximum CPU cores usable per node | `UNLIMITED` | `UNLIMITED` means no limit. |
| Nodes | Nodes in the partition | `CENI-KFSJK,CENI-CSSJK` | This partition contains the nodes `CENI-KFSJK` and `CENI-CSSJK`. |
| PriorityJobFactor | Job priority factor | `1` | This partition's job priority factor is 1. |
| PriorityTier | Partition priority tier | `1` | This partition's priority tier is 1. |
| RootOnly | Whether only root may submit jobs | `NO` | `NO` means all users may submit jobs. |
| ReqResv | Whether a reservation is required | `NO` | `NO` means no reservation is required. |
| OverSubscribe | Whether resources may be oversubscribed | `NO` | `NO` means oversubscription is not allowed. |
| OverTimeLimit | Handling of jobs that exceed their time limit | `NONE` | `NONE` means no special handling on timeout. |
| PreemptMode | Job preemption mode | `OFF` | `OFF` means preemption is disabled. |
| State | Current partition state | `UP` | `UP` means the partition is available. |
| TotalCPUs | Total CPU cores across the partition's nodes | `16` | This partition has 16 CPU cores in total. |
| TotalNodes | Total number of nodes in the partition | `2` | This partition contains 2 nodes. |
| SelectTypeParameters | Node-selection type parameters | `NONE` | `NONE` means no special selection parameters. |
| JobDefaults | Default job settings | `(null)` | `(null)` means no job defaults are configured. |
| DefMemPerNode | Default memory limit per node | `UNLIMITED` | `UNLIMITED` means no default memory limit. |
| MaxMemPerNode | Maximum memory limit per node | `UNLIMITED` | `UNLIMITED` means no maximum memory limit. |
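
A partition-lifecycle sketch against these endpoints, assuming `requests` and placeholder server address and credentials (parameter casing follows the query lists above):

```python
import requests

BASE = "https://pcapi.example.com"  # hypothetical server address
AUTH = ("user", "password")         # BasicAuth credentials

def partition_call(action, **params):
    """GET /api/v1/slurm/partition/<action> with the given query parameters."""
    url = f"{BASE}/api/v1/slurm/partition/{action}"
    return requests.get(url, auth=AUTH, params=params).json()

# Create, restrict, disable, then delete a partition
partition_call("create_partition", PartitionName="kaiyuanyun", Nodes="CENI-KFSJK")
partition_call("update_partition", PartitionName="kaiyuanyun", AllowGroups="testgroup")
partition_call("update_partition", PartitionName="kaiyuanyun", State="DOWN")
partition_call("delete_partition", PartitionName="kaiyuanyun")
```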
88
wwwroot/doc/storage/通用存储.md
Normal file
@ -0,0 +1,88 @@
## GET Get storage info (JSON)

* GET /api/v1/storage/common/get_storage_json

* Request parameter: point (mount point)

* Example response


```json
{
  "status": "success",
  "data": [
    {
      "Filesystem": "tmpfs",
      "Type": "tmpfs",
      "Size": "3.2G",
      "Used": "1.3M",
      "Avail": "3.2G",
      "Use%": "1%",
      "Mounted": "/run"
    },
    {
      "Filesystem": "/dev/mapper/ubuntu--vg-ubuntu--lv",
      "Type": "ext4",
      "Size": "48G",
      "Used": "13G",
      "Avail": "33G",
      "Use%": "29%",
      "Mounted": "/"
    },
    {
      "Filesystem": "tmpfs",
      "Type": "tmpfs",
      "Size": "16G",
      "Used": "0",
      "Avail": "16G",
      "Use%": "0%",
      "Mounted": "/dev/shm"
    },
    {
      "Filesystem": "tmpfs",
      "Type": "tmpfs",
      "Size": "5.0M",
      "Used": "0",
      "Avail": "5.0M",
      "Use%": "0%",
      "Mounted": "/run/lock"
    },
    {
      "Filesystem": "/dev/vda2",
      "Type": "ext4",
      "Size": "2.0G",
      "Used": "253M",
      "Avail": "1.6G",
      "Use%": "14%",
      "Mounted": "/boot"
    },
    {
      "Filesystem": "nfsserver:/d",
      "Type": "nfs4",
      "Size": "1.8T",
      "Used": "114G",
      "Avail": "1.6T",
      "Use%": "7%",
      "Mounted": "/d"
    },
    {
      "Filesystem": "tmpfs",
      "Type": "tmpfs",
      "Size": "3.2G",
      "Used": "4.0K",
      "Avail": "3.2G",
      "Use%": "1%",
      "Mounted": "/run/user/0"
    },
    {
      "Filesystem": "tmpfs",
      "Type": "tmpfs",
      "Size": "3.2G",
      "Used": "4.0K",
      "Avail": "3.2G",
      "Use%": "1%",
      "Mounted": "/run/user/1000"
    }
  ]
}
```
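
A usage sketch that inspects the NFS mounts returned by this endpoint, assuming `requests` and placeholder server address and credentials:

```python
import requests

BASE = "https://pcapi.example.com"  # hypothetical server address
AUTH = ("user", "password")         # BasicAuth credentials

resp = requests.get(f"{BASE}/api/v1/storage/common/get_storage_json",
                    auth=AUTH, params={"point": "/d"}).json()

# Print every NFS mount and how full it is
for fs in resp["data"]:
    if fs["Type"].startswith("nfs"):
        print(f'{fs["Filesystem"]} mounted at {fs["Mounted"]}: '
              f'{fs["Used"]} of {fs["Size"]} used ({fs["Use%"]})')
```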
8
wwwroot/index.dspy
Normal file
@ -0,0 +1,8 @@
info(f'test .....{params_kw=}')

data = {
    "k": "${key}$",
    "s": "${secretkey}$"
}
ns = paramify(data, params_kw)
return ns