From a5e8899b5a34bb5d4cc01950c6898a522d64ca06 Mon Sep 17 00:00:00 2001
From: yumoqing
Date: Wed, 16 Jul 2025 14:32:22 +0800
Subject: [PATCH] first commit

---
 README.md             |  58 +++++++++++++++++++++++
 app/README.md         |   0
 app/qwenvl.py         | 105 +++++++++++++++++++++++++++++++++++++++++
 app/test.py           |  32 +++++++++++++
 conf/README.md        |   0
 conf/config.json      |  48 +++++++++++++++++++
 files/README.md       |   0
 requirements.txt      |   8 ++++
 script/install.sh     |   3 ++
 script/qwenvl.service |  13 +++++
 script/qwenvl.sh      |   5 ++
 wwwroot/README.md     |   0
 wwwroot/api/index.dspy |  9 ++++
 13 files changed, 281 insertions(+)
 create mode 100644 README.md
 create mode 100644 app/README.md
 create mode 100644 app/qwenvl.py
 create mode 100644 app/test.py
 create mode 100644 conf/README.md
 create mode 100755 conf/config.json
 create mode 100644 files/README.md
 create mode 100644 requirements.txt
 create mode 100755 script/install.sh
 create mode 100644 script/qwenvl.service
 create mode 100755 script/qwenvl.sh
 create mode 100644 wwwroot/README.md
 create mode 100644 wwwroot/api/index.dspy

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e464fbc
--- /dev/null
+++ b/README.md
@@ -0,0 +1,58 @@
+# Qwen2-VL deployment instance
+
+## dependencies
+* git+https://git.kaiyuancloud.cn/yumoqing/apppublic
+* git+https://git.kaiyuancloud.cn/yumoqing/ahserver
+
+## pre-installation
+First, create a new Python virtual environment:
+```
+python3 -m venv ~/vl
+```
+Create two shell scripts named vlpy and vlpip:
+```vlpy
+#!/usr/bin/bash
+~/vl/bin/python $*
+```
+and
+```vlpip
+#!/usr/bin/bash
+~/vl/bin/pip $*
+```
+then move them to the bin directory under your $HOME folder and make them executable:
+```
+mv vlpip vlpy ~/bin
+chmod +x ~/bin/vl*
+```
+
+## installation
+
+Follow the instructions from [Qwen2-VL](https://github.com/QwenLM/Qwen2-VL), remembering to replace pip with vlpip.
+
+Then do the following:
+```
+vlpip install git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830 accelerate
+vlpip install qwen-vl-utils[decord]
+
+git clone https://git.kaiyuancloud.cn/yumoqing/qwenvl
+cd qwenvl/script
+sudo ./install.sh
+```
+
+## Change model or http port
+There is a config.json file under the qwenvl/conf folder; change the "modelname" and "port" values to suit your requirements.
+
+## models to use
+
+* Qwen/Qwen2-VL-7B-Instruct-AWQ
+* Qwen/Qwen2-VL-7B-Instruct
+* Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4
+* Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8
+* Qwen/Qwen2-VL-72B-Instruct
+* Qwen/Qwen2-VL-72B-Instruct-AWQ
+* Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int4
+* Qwen/Qwen2-VL-72B-Instruct-GPTQ-Int8
+* Qwen/Qwen2-VL-2B-Instruct
+* Qwen/Qwen2-VL-2B-Instruct-AWQ
+* Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4
+* Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8
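+
+## calling the service
+
+Once the service is running, POST a prompt and a base64-encoded image to the /api endpoint; the handler in wwwroot/api/index.dspy answers with a JSON object holding "content" and "time_cost" fields. Below is a minimal sketch in the spirit of app/test.py; the host name, the image path example.jpg and the prompt text are placeholder assumptions, adjust them to your deployment:
+```python
+import base64
+import requests
+
+# read a local image and encode it as base64 (same as file2b64 in app/test.py)
+with open('example.jpg', 'rb') as f:
+    image_b64 = base64.b64encode(f.read()).decode('utf-8')
+
+resp = requests.post('http://localhost:10090/api',
+                     data={'prompt': 'Describe this image', 'image': image_b64})
+print(resp.text)  # JSON with "content" (model output) and "time_cost" (seconds)
+```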
diff --git a/app/README.md b/app/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/app/qwenvl.py b/app/qwenvl.py
new file mode 100644
index 0000000..614ec27
--- /dev/null
+++ b/app/qwenvl.py
@@ -0,0 +1,105 @@
+import torch
+from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from qwen_vl_utils import process_vision_info
+from appPublic.worker import awaitify
+from appPublic.jsonConfig import getConfig
+from ahserver.serverenv import ServerEnv
+from ahserver.webapp import webapp
+
+class Qwen2VLClass:
+    def __init__(self, modelname):
+        # default: load the model on the available device(s)
+        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
+            modelname,
+            torch_dtype=torch.bfloat16,
+            # attn_implementation="flash_attention_2",
+            device_map="auto"
+        )
+        self.min_pixels = 256 * 28 * 28
+        self.max_pixels = 1280 * 28 * 28
+        # We recommend enabling flash_attention_2 for better acceleration and memory
+        # saving, especially in multi-image and video scenarios.
+        # model = Qwen2VLForConditionalGeneration.from_pretrained(
+        #     "Qwen/Qwen2-VL-7B-Instruct",
+        #     torch_dtype=torch.bfloat16,
+        #     attn_implementation="flash_attention_2",
+        #     device_map="auto",
+        # )
+
+        # default processor
+        self.processor = AutoProcessor.from_pretrained(modelname,
+            min_pixels=self.min_pixels,
+            max_pixels=self.max_pixels
+        )
+
+        # The default range for the number of visual tokens per image in the model is
+        # 4-16384. You can set min_pixels and max_pixels according to your needs, such
+        # as a token range of 256-1280, to balance performance and cost.
+        # min_pixels = 256*28*28
+        # max_pixels = 1280*28*28
+        # processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
+
+    def inference(self, prompt, image, videofile):
+        # image: a base64 string, a file:/// path or an http(s) URL;
+        # videofile: a file:/// path only
+        content = [
+            {
+                "type": "text",
+                "text": prompt
+            }
+        ]
+        if image:
+            # a bare base64 string is wrapped into a data URI;
+            # file:/// and http(s) URLs pass through unchanged
+            if not image.startswith('file:///') \
+                    and not image.startswith('http://') \
+                    and not image.startswith('https://'):
+                image = f'data:image;base64,{image}'
+            content.append({
+                "type": "image",
+                "image": image
+            })
+        if videofile:
+            if not videofile.startswith('file:///'):
+                return 'only local video files are supported'
+
+            content.append({
+                "type": "video",
+                "video": videofile
+            })
+
+        messages = [
+            {
+                "role": "user",
+                "content": content
+            }
+        ]
+
+        # Preparation for inference
+        text = self.processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = self.processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to("cuda")  # assumes a CUDA-capable GPU is available
+
+        # Inference: generation of the output
+        generated_ids = self.model.generate(**inputs, max_new_tokens=128)
+        # strip the prompt tokens so only the newly generated tokens are decoded
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = self.processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        return output_text[0]
+
+def main():
+    config = getConfig()
+    modelname = config.modelname
+    m = Qwen2VLClass(modelname)
+    g = ServerEnv()
+    # expose the blocking inference method as an awaitable so the async
+    # server (and wwwroot/api/index.dspy) can call it with await
+    g.inference = awaitify(m.inference)
+
+
+if __name__ == '__main__':
+    webapp(main)
diff --git a/app/test.py b/app/test.py
new file mode 100644
index 0000000..e0ff008
--- /dev/null
+++ b/app/test.py
@@ -0,0 +1,32 @@
+import time
+import requests
+import base64
+
+def file2b64(file_path):
+    # read the file contents
+    with open(file_path, 'rb') as file:
+        file_content = file.read()
+
+    # encode the contents as Base64
+    base64_encoded_data = base64.b64encode(file_content)
+
+    # decode the Base64 bytes into a UTF-8 string
+    base64_encoded_str = base64_encoded_data.decode('utf-8')
+
+    return base64_encoded_str
+
+while True:
+    print('prompt:')
+    p = input()
+    print('input image path:')
+    i = input()
+    if p == '' or i == '':
+        continue
+    t1 = time.time()
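+    # NOTE: pd4e.com:10090 is the deployment hard-coded by this test script;
+    # point it at your own host (the port is the "port" value in conf/config.json)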
"modelname":"Qwen/Qwen2-VL-2B-Instruct", + "logger":{ + "name":"qwenvl", + "levelname":"info", + "logfile":"$[workdir]$/logs/qwenvl.log" + }, + "filesroot":"$[workdir]$/files", + "website":{ + "paths":[ + ["$[workdir]$/wwwroot",""] + ], + "client_max_size":10000, + "host":"0.0.0.0", + "port":10090, + "coding":"utf-8", + "indexes":[ + "index.html", + "index.tmpl", + "index.ui", + "index.dspy", + "index.md" + ], + "startswiths":[ + { + "leading":"/idfile", + "registerfunction":"idFileDownload" + } + ], + "processors":[ + [".dspy","dspy"], + [".md","md"] + ], + "session_max_time":3000, + "session_issue_time":2500, + "session_redis_notuse":{ + "url":"redis://127.0.0.1:6379" + } + }, + "langMapping":{ + "zh-Hans-CN":"zh-cn", + "zh-CN":"zh-cn", + "en-us":"en", + "en-US":"en" + } +} diff --git a/files/README.md b/files/README.md new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1b6b397 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ +git+https://git.kaiyuancloud.cn/yumoqing/apppublic +git+https://git.kaiyuancloud.cn/yumoqing/ahserver +git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830 +accelerate +torch +torchaudio +optimum +auto_gptq diff --git a/script/install.sh b/script/install.sh new file mode 100755 index 0000000..bbd75cd --- /dev/null +++ b/script/install.sh @@ -0,0 +1,3 @@ +sudo cp qwenvl.service /etc/systemd/system +sudo systemctl enable qwenvl.service +sudo systemctl start qwenvl diff --git a/script/qwenvl.service b/script/qwenvl.service new file mode 100644 index 0000000..cd4d304 --- /dev/null +++ b/script/qwenvl.service @@ -0,0 +1,13 @@ +[Unit] +Description=qwen2-vl inference service +Documention=qwen2-vl inference service to control sage service start or stop +Wants=systemd-networkd.service +Requires=nginx.service + +[Service] +Type=forking +ExecStart=su - ymq -c "/d/ymq/py/qwenvl/script/qwenvl.sh" +ExecStop=su - ymq "/d/ymq/bin/killname qwenvl.py" +[Install] +WantedBy=multi-user.target + diff --git a/script/qwenvl.sh b/script/qwenvl.sh new file mode 100755 index 0000000..133c934 --- /dev/null +++ b/script/qwenvl.sh @@ -0,0 +1,5 @@ +#!/usr/bin/bash + +killname /py/qwenvl/app/qwenvl.py +~/ve/qwenvl/bin/python ~/py/qwenvl/app/qwenvl.py -w ~/py/qwenvl >~/py/qwenvl/logs/stderr.log 2>&1 & +exit 0 diff --git a/wwwroot/README.md b/wwwroot/README.md new file mode 100644 index 0000000..e69de29 diff --git a/wwwroot/api/index.dspy b/wwwroot/api/index.dspy new file mode 100644 index 0000000..1873f9a --- /dev/null +++ b/wwwroot/api/index.dspy @@ -0,0 +1,9 @@ +info(f'{params_kw=}') +t1 = time.time() +ret = await inference(params_kw.prompt, params_kw.image, params_kw.video) +t2 = time.time() +info(f'{ret=}') +return { + "content":ret, + "time_cost":t2 - t1 +}