first commit

2025-07-16 14:32:22 +08:00 · 2025-07-16 14:32:22 +08:00 · a5e8899b5a
commit a5e8899b5a
13 changed files with 281 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,58 @@
 # Qwen2-VL deployment instances
 ## dependencies
 git+https://git.kaiyuancloud.cn/yumoqing/apppublic
 git+https://git.kaiyuancloud.cn/yumoqing/ahserver
 ## preinstallation
 first, create a new python virtual env
 ```
 python3 -m venv ~/vl
 ```
 create two shell scripts named vlpy and vlpip:
 ```vlpy
 #!/usr/bin/bash
 ~/vl/bin/python $*
 ```
 and 
 ```vlpip
 #!/usr/bin/bash
 ~/vl/bin/pip $*
 ```
 then move them to the bin folder under your $HOME folder and make them executable with chmod +x
 ```
 mv vlpip vlpy ~/bin
 chmod +x ~/bin/vl*
 ```
 ## installation
 follow the instructions from [Qwen2-VL](https://github.com/QwenLM/Qwen2-VL), remembering to replace pip with vlpip,
 then do the following:
 ```
 vlpip install git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830 accelerate
 vlpip install qwen-vl-utils[decord]
 git clone https://git.kaiyuancloud.cn/yumoqing/qwenvl
 cd qwenvl/script
 sudo sh install.sh
 ```
 ## Change model or http port
 there is a config.json file under the conf folder of qwenvl; change the "modelname" and "port" values to suit your requirements
 ## model to use
 * Qwen/Qwen2-VL-7B-Instruct-AWQ
 * Qwen/Qwen2-VL-7B-Instruct
 * Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4
 * Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8
 * Qwen/Qwen2-VL-72B-Instruct
 * Qwen/Qwen2-VL-72B-Instruct-AWQ
 * Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4
 * Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8
 * Qwen/Qwen2-VL-2B-Instruct
 * Qwen/Qwen2-VL-2B-Instruct-AWQ
 * Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4
 * Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8
--- a/app/README.md
+++ b/app/README.md
--- a/app/qwenvl.py
+++ b/app/qwenvl.py
@ -0,0 +1,105 @@
 import torch
 from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 from qwen_vl_utils import process_vision_info
 from appPublic.worker import awaitify
 from appPublic.jsonConfig import getConfig
 from ahserver.serverenv import ServerEnv
 from ahserver.webapp import webapp
 class Qwen2VLClass:
 	"""Hold a Qwen2-VL model and its processor and run single-turn inference.

 	``__init__`` sets ``model``, ``processor``, ``min_pixels`` and
 	``max_pixels``; ``inference`` reads ``model`` and ``processor``.
 	"""
 	# Image values starting with one of these are passed through unchanged;
 	# any other image value is treated as a bare base64 payload.
 	IMAGE_URI_PREFIXES = ('file:///', 'http://', 'https://', 'data:')
 	# Returned (not raised) when a video is given that is not a file:/// URI.
 	VIDEO_NOT_LOCAL_MESSAGE = 'only local video file support'

 	def __init__(self, modelname):
 		"""Load the model and processor identified by ``modelname``.

 		modelname: value handed to both ``from_pretrained`` calls.
 		"""
 		# default: Load the model on the available device(s)
 		self.model = Qwen2VLForConditionalGeneration.from_pretrained(
 			modelname,
 			torch_dtype=torch.bfloat16,
 			# We recommend enabling flash_attention_2 for better acceleration
 			# and memory saving, especially in multi-image and video scenarios:
 			# attn_implementation="flash_attention_2",
 			device_map="auto"
 		)
 		# The default range for the number of visual tokens per image in the
 		# model is 4-16384. min_pixels and max_pixels narrow it to a token
 		# range of 256-1280 to balance performance and cost.
 		self.min_pixels = 256 * 28 * 28
 		self.max_pixels = 1280 * 28 * 28
 		self.processor = AutoProcessor.from_pretrained(modelname,
 			min_pixels=self.min_pixels,
 			max_pixels=self.max_pixels
 		)

 	def inference(self, prompt, image, videofile, max_new_tokens=128):
 		"""Generate a reply to ``prompt`` about an optional image and/or video.

 		prompt: text placed in the user message.
 		image: falsy for no image; a file:///, http://, https:// or data: URI
 			is used as is; any other string is wrapped as
 			``data:image;base64,<image>``.
 		videofile: falsy for no video; otherwise must start with file:///.
 		max_new_tokens: forwarded to ``model.generate`` (default 128).

 		Returns the decoded text of the first generated sequence, or the
 		string 'only local video file support' when ``videofile`` is set but
 		is not a file:/// URI.
 		"""
 		content = [{"type": "text", "text": prompt}]
 		if image:
 			if not image.startswith(self.IMAGE_URI_PREFIXES):
 				image = f'data:image;base64,{image}'
 			content.append({"type": "image", "image": image})
 		if videofile:
 			if not videofile.startswith('file:///'):
 				return self.VIDEO_NOT_LOCAL_MESSAGE
 			content.append({"type": "video", "video": videofile})
 		messages = [{"role": "user", "content": content}]
 		# Preparation for inference
 		text = self.processor.apply_chat_template(
 			messages, tokenize=False, add_generation_prompt=True
 		)
 		image_inputs, video_inputs = process_vision_info(messages)
 		inputs = self.processor(
 			text=[text],
 			images=image_inputs,
 			videos=video_inputs,
 			padding=True,
 			return_tensors="pt",
 		)
 		# Follow the device the model was actually placed on by
 		# device_map="auto" instead of assuming "cuda".
 		inputs = inputs.to(self.model.device)
 		# Inference: Generation of the output
 		generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
 		# Drop the prompt tokens so only the newly generated part is decoded.
 		generated_ids_trimmed = [
 			out_ids[len(in_ids):]
 			for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
 		]
 		output_text = self.processor.batch_decode(
 			generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
 		)
 		return output_text[0]
 def main():
 	"""Build the model named in the config and publish its inference call.

 	Reads ``modelname`` from ``getConfig()``, constructs ``Qwen2VLClass`` with
 	it, and stores the ``awaitify``-wrapped ``inference`` method on the
 	``ServerEnv`` instance under the name ``inference``.
 	"""
 	engine = Qwen2VLClass(getConfig().modelname)
 	env = ServerEnv()
 	env.inference = awaitify(engine.inference)
 # Script entry point: hand ``main`` to ``webapp`` when run directly.
 if __name__ == '__main__':
 	webapp(main)
--- a/app/test.py
+++ b/app/test.py
@ -0,0 +1,32 @@
 import time
 import requests
 import base64
 def file2b64(file_path):
    """Return the bytes of ``file_path`` Base64-encoded as a UTF-8 string."""
    with open(file_path, 'rb') as source:
        raw_bytes = source.read()
    return base64.b64encode(raw_bytes).decode('utf-8')
 # Interactive client: repeatedly read a prompt and an image path from stdin,
 # post them to the inference endpoint and print the reply with the elapsed time.
 while True:
 	print('prompt:')
 	prompt = input()
 	print('input image path:')
 	image_path = input()
 	# Both values are required; ask again when either one is empty.
 	if not (prompt and image_path):
 		continue
 	started = time.time()
 	payload = {
 		'prompt': prompt,
 		'image': file2b64(image_path),
 	}
 	ret = requests.post('http://pd4e.com:10090/api', data=payload)
 	finished = time.time()
 	print(ret.text, finished - started, 'seconds')
--- a/conf/README.md
+++ b/conf/README.md
--- a/conf/config.json
+++ b/conf/config.json
@ -0,0 +1,48 @@
 {
 	"password_key":"!@#$%^&*(*&^%$QWERTYUIqwertyui234567",
 	"modelname":"Qwen/Qwen2-VL-2B-Instruct",
 	"logger":{
 		"name":"qwenvl",
 		"levelname":"info",
 		"logfile":"$[workdir]$/logs/qwenvl.log"
 	},
 	"filesroot":"$[workdir]$/files",
 	"website":{
 		"paths":[
 			["$[workdir]$/wwwroot",""]
 		],
 		"client_max_size":10000,
 		"host":"0.0.0.0",
 		"port":10090,
 		"coding":"utf-8",
 		"indexes":[
 			"index.html",
 			"index.tmpl",
 			"index.ui",
 			"index.dspy",
 			"index.md"
 		],
 		"startswiths":[
 			{
 				"leading":"/idfile",
 				"registerfunction":"idFileDownload"
 			}
 		],
 		"processors":[
 			[".dspy","dspy"],
 			[".md","md"]
 		],
 		"session_max_time":3000,
 		"session_issue_time":2500,
 		"session_redis_notuse":{
 			"url":"redis://127.0.0.1:6379"
 		}
 	},
 	"langMapping":{
 		"zh-Hans-CN":"zh-cn",
 		"zh-CN":"zh-cn",
 		"en-us":"en",
 		"en-US":"en"
 	}
 }
--- a/files/README.md
+++ b/files/README.md
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,8 @@
 git+https://git.kaiyuancloud.cn/yumoqing/apppublic
 git+https://git.kaiyuancloud.cn/yumoqing/ahserver
 git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830 
 accelerate
 torch
 torchaudio
 optimum
 auto_gptq
 qwen-vl-utils[decord]
--- a/script/install.sh
+++ b/script/install.sh
@ -0,0 +1,3 @@
 #!/usr/bin/bash
 # Install the qwenvl systemd unit, enable it and start it.
 # Stop at the first failing step so a failed copy is not followed by enable/start.
 set -e
 # qwenvl.service sits next to this script; switch there so the copy works
 # no matter which directory the script is invoked from.
 cd "$(dirname "$0")"
 sudo cp qwenvl.service /etc/systemd/system
 sudo systemctl enable qwenvl.service
 sudo systemctl start qwenvl
--- a/script/qwenvl.service
+++ b/script/qwenvl.service
@ -0,0 +1,13 @@
 [Unit]
 Description=qwen2-vl inference service
 # Unit used to start and stop the qwen2-vl inference service.
 # (Kept as a comment: Documentation= accepts only URIs, not free text.)
 Wants=systemd-networkd.service
 Requires=nginx.service
 [Service]
 # qwenvl.sh launches the server in the background and exits, hence forking.
 Type=forking
 ExecStart=su - ymq -c "/d/ymq/py/qwenvl/script/qwenvl.sh"
 # -c is required so su runs the quoted string as a command instead of
 # treating it as a script file name.
 ExecStop=su - ymq -c "/d/ymq/bin/killname qwenvl.py"
 [Install]
 WantedBy=multi-user.target
--- a/script/qwenvl.sh
+++ b/script/qwenvl.sh
@ -0,0 +1,5 @@
 #!/usr/bin/bash
 # (Re)start the qwenvl server in the background, logging stdout and stderr
 # to ~/py/qwenvl/logs/stderr.log.
 # NOTE(review): killname is an external helper; presumably it terminates
 # processes matching the given name - confirm.
 killname /py/qwenvl/app/qwenvl.py
 # The output redirection below fails when the logs directory is missing,
 # so create it first.
 mkdir -p ~/py/qwenvl/logs
 ~/ve/qwenvl/bin/python ~/py/qwenvl/app/qwenvl.py -w ~/py/qwenvl >~/py/qwenvl/logs/stderr.log 2>&1 &
 exit 0
--- a/wwwroot/README.md
+++ b/wwwroot/README.md
--- a/wwwroot/api/index.dspy
+++ b/wwwroot/api/index.dspy
@ -0,0 +1,9 @@
 info(f'{params_kw=}')
 t1 = time.time()
 ret = await inference(params_kw.prompt, params_kw.image, params_kw.video)
 t2 = time.time()
 info(f'{ret=}')
 return {
 	"content":ret,
 	"time_cost":t2 - t1
 }