From 7d81f9093a1043e712908ff49554ee108d3a7de2 Mon Sep 17 00:00:00 2001
From: yumoqing <yumoqing@gmail.com>
Date: Wed, 16 Jul 2025 14:29:05 +0800
Subject: [PATCH] first commit

---
 README.md                      |  35 +++++++++++
 app/fastvlm.py                 | 106 +++++++++++++++++++++++++++++++++
 conf/config.json               |  74 +++++++++++++++++++++++
 files/README.md                |   0
 fvlm.service                   |  17 ++++++
 logs/README.md                 |   0
 requirements.txt               |   5 ++
 start.sh                       |   3 +
 stop.sh                        |   3 +
 wwwroot/bricks                 |   1 +
 wwwroot/index.md               |  11 ++++
 wwwroot/index.ui               |   7 +++
 wwwroot/v1/generate/index.dspy |   8 +++
 13 files changed, 270 insertions(+)
 create mode 100644 README.md
 create mode 100644 app/fastvlm.py
 create mode 100644 conf/config.json
 create mode 100644 files/README.md
 create mode 100644 fvlm.service
 create mode 100644 logs/README.md
 create mode 100644 requirements.txt
 create mode 100755 start.sh
 create mode 100755 stop.sh
 create mode 120000 wwwroot/bricks
 create mode 100644 wwwroot/index.md
 create mode 100644 wwwroot/index.ui
 create mode 100644 wwwroot/v1/generate/index.dspy

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..20b4649
--- /dev/null
+++ b/README.md
@@ -0,0 +1,35 @@
+# A FastVLM webserver
+[FastVLM](https://github.com/apple/ml-fastvlm) is an open-source multimodal LLM
+released by Apple; it takes an image and a prompt as input and generates text.
+
+## Create a virtual environment
+```
+python3 -m venv ~/fastvlm.env
+source ~/fastvlm.env/bin/activate
+```
+
+## Installation
+```
+git clone https://github.com/apple/ml-fastvlm.git
+cd ml-fastvlm
+pip install -U .
+git clone https://git.kaiyuancloud.cn/yumoqing/fvlm.git
+pip install git+https://git.kaiyuancloud.cn/yumoqing/apppublic.git
+pip install git+https://git.kaiyuancloud.cn/yumoqing/sqlor.git
+pip install git+https://git.kaiyuancloud.cn/yumoqing/ahserver.git
+```
+
+## Start the web server
+```
+cd fvlm
+python app/fastvlm.py
+```
+The server listens on port 9994.
+
+## Usage
+```
+curl http://localhost:9994/v1/generate \
+	-F "prompt=描述这张图片" \
+	-F "image_path=@path_to_image"
+```
+
diff --git a/app/fastvlm.py b/app/fastvlm.py
new file mode 100644
index 0000000..b864f9f
--- /dev/null
+++ b/app/fastvlm.py
@@ -0,0 +1,106 @@
+#
+# Modified from LLaVA/predict.py
+# Please see ACKNOWLEDGEMENTS for details about LICENSE
+#
+import os
+import torch
+import time
+from PIL import Image
+
+from llava.utils import disable_torch_init
+from llava.conversation import conv_templates
+from llava.model.builder import load_pretrained_model
+from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
+from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from ahserver.webapp import webapp
+from ahserver.serverenv import ServerEnv
+from appPublic.jsonConfig import getConfig
+from appPublic.log import debug, exception, error
+from appPublic.worker import awaitify
+
+class FastVLM:
+	def __init__(self):
+		self.config = getConfig()
+		model_path = self.config.model_path
+		"""
+		generation_config = None
+		if os.path.exists(os.path.join(model_path, 'generation_config.json')):
+			generation_config = os.path.join(model_path, '.generation_config.json')
+			os.rename(os.path.join(model_path, 'generation_config.json'),
+					  generation_config)
+		"""
+
+		# Load model
+		disable_torch_init()
+		model_name = get_model_name_from_path(model_path)
+		model_base = None
+		device = self.config.device
+		tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, model_base, model_name, device=device)
+		self.tokenizer = tokenizer
+		self.model = model
+		self.image_processor = image_processor
+		self.context_len = context_len
+
+	def _generate(self, image_file, prompt, 
+				temperature=0.2,
+				top_p=None,
+				num_beams=1,
+				conv_mode='qwen_2'):
+		qs = prompt
+		t1 = time.time()
+		if self.model.config.mm_use_im_start_end:
+			qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
+		else:
+			qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
+		conv = conv_templates[conv_mode].copy()
+		conv.append_message(conv.roles[0], qs)
+		conv.append_message(conv.roles[1], None)
+		prompt = conv.get_prompt()
+
+		# Set the pad token id for generation
+		self.model.generation_config.pad_token_id = self.tokenizer.pad_token_id
+
+		# Tokenize prompt
+		input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') \
+					.unsqueeze(0).to(self.model.device)
+
+		# Load and preprocess image
+		image = Image.open(image_file).convert('RGB')
+		image_tensor = process_images([image], self.image_processor, self.model.config)[0]
+
+		# Run inference
+		with torch.inference_mode():
+			output_ids = self.model.generate(
+				input_ids,
+				images=image_tensor.unsqueeze(0).half(),
+				image_sizes=[image.size],
+				do_sample=True if temperature > 0 else False,
+				temperature=temperature,
+				top_p=top_p,
+				num_beams=num_beams,
+				max_new_tokens=256,
+				use_cache=True)
+
+			outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
+			t2 = time.time()
+			return {
+				'timecost': t2 - t1,
+				'content': outputs
+			}
+		debug(f'Exception happened .......')
+		return None
+
+	async def generate(self, image_file, prompt):
+		f = awaitify(self._generate)
+		return await f(image_file, prompt)
+
+fastvlm = None
+def init():
+	global fastvlm
+	g = ServerEnv()
+	g.fastvlm = fastvlm
+	fastvlm = FastVLM()
+	g.generate = fastvlm.generate
+
+if __name__ == "__main__":
+	webapp(init)
diff --git a/conf/config.json b/conf/config.json
new file mode 100644
index 0000000..02c03ac
--- /dev/null
+++ b/conf/config.json
@@ -0,0 +1,74 @@
+{
+	"language":{
+		"zh":{
+			"sentence_splitter":"[。？！]|\r?\n"
+		},
+		"en":{
+			"sentence_splitter":"[.?!] |\r?\n"
+		}
+	},
+	"model_path":"/share/models/apple/llava-fastvithd_0.5b_stage3",
+	"device":"cuda:0",
+	"filesroot":"$[workdir]$/files",
+	"logger":{
+		"name":"fvlm",
+		"levelname":"info",
+		"logfile":"$[workdir]$/logs/fvlm.log"
+	},
+	"website":{
+		"paths":[
+			["$[workdir]$/wwwroot",""]
+		],
+		"client_max_size":10000,
+		"host":"0.0.0.0",
+		"port":9994,
+		"coding":"utf-8",
+		"ssl_gg":{
+			"crtfile":"$[workdir]$/conf/www.bsppo.com.pem",
+			"keyfile":"$[workdir]$/conf/www.bsppo.com.key"
+		},
+		"indexes":[
+			"index.html",
+			"index.tmpl",
+			"index.ui",
+			"index.dspy",
+			"index.md"
+		],
+		"startswiths":[
+			{
+				"leading":"/idfile",
+				"registerfunction":"idfile"
+			}
+		],
+		"processors":[
+			[".ws","ws"],
+			[".xterm","xterm"],
+			[".proxy","proxy"],
+			[".llm", "llm"],
+			[".llms", "llms"],
+			[".llma", "llma"],
+			[".xlsxds","xlsxds"],
+			[".sqlds","sqlds"],
+			[".tmpl.js","tmpl"],
+			[".tmpl.css","tmpl"],
+			[".html.tmpl","tmpl"],
+			[".bcrud", "bricks_crud"],
+			[".tmpl","tmpl"],
+			[".app","app"],
+			[".bui","bui"],
+			[".ui","bui"],
+			[".dspy","dspy"],
+			[".md","md"]
+		],
+		"rsakey":{
+			"privatekey":"$[workdir]$/conf/rsa_private_key.pem",
+			"publickey":"$[workdir]$/conf/rsa_public_key.pem"
+		},
+		"session_max_time":3000,
+		"session_issue_time":2500,
+		"session_redis_notuse":{
+			"url":"redis://127.0.0.1:6379"
+		}
+	}
+}
+
diff --git a/files/README.md b/files/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/fvlm.service b/fvlm.service
new file mode 100644
index 0000000..2820fc9
--- /dev/null
+++ b/fvlm.service
@@ -0,0 +1,17 @@
+[Unit]
+Wants=systemd-networkd.service
+
+[Service]
+User=ymq
+Group=ymq
+Type=forking
+WorkingDirectory=/share/ymq/run/fvlm
+# ExecStart=/share/ymq/run/fvlm/fvlm.env/bin/python app/fastvlm.py -p 9994
+ExecStart=/share/ymq/run/fvlm/start.sh
+ExecStop=/share/ymq/run/fvlm/stop.sh
+StandardOutput=append:/var/log/fvlm/fvlm.log
+StandardError=append:/var/log/fvlm/fvlm.log
+SyslogIdentifier=fvlm
+
+[Install]
+WantedBy=multi-user.target
diff --git a/logs/README.md b/logs/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..66ee57d
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+llava
+# git+https://github.com/apple/ml-fastvlm
+git+https://git.kaiyuancloud.cn/yumoqing/apppublic
+git+https://git.kaiyuancloud.cn/yumoqing/sqlor
+git+https://git.kaiyuancloud.cn/yumoqing/ahserver
diff --git a/start.sh b/start.sh
new file mode 100755
index 0000000..0d483b1
--- /dev/null
+++ b/start.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/bash
+# Start the server in the background with only GPU 6 visible; app/fastvlm.py is relative, so run from the project root.
+CUDA_VISIBLE_DEVICES=6 /share/ymq/run/fvlm/fvlm.env/bin/python app/fastvlm.py -p 9994 &
diff --git a/stop.sh b/stop.sh
new file mode 100755
index 0000000..5b4db65
--- /dev/null
+++ b/stop.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/bash
+# Stop the server via the site-local killname helper; presumably kills processes matching fastvlm.py — confirm.
+/d/ymq/bin/killname fastvlm.py
diff --git a/wwwroot/bricks b/wwwroot/bricks
new file mode 120000
index 0000000..8bcfc42
--- /dev/null
+++ b/wwwroot/bricks
@@ -0,0 +1 @@
+/tmp/dist
\ No newline at end of file
diff --git a/wwwroot/index.md b/wwwroot/index.md
new file mode 100644
index 0000000..c8a04db
--- /dev/null
+++ b/wwwroot/index.md
@@ -0,0 +1,11 @@
+# A FastVLM webserver
+[FastVLM](https://github.com/apple/ml-fastvlm) is an open-source multimodal LLM
+released by Apple; it takes an image and a prompt as input and generates text.
+
+## Usage
+```
+curl https://{domain}/v1/generate \
+	-F "prompt=描述这张图片" \
+	-F "image_path=@path_to_image"
+```
+
diff --git a/wwwroot/index.ui b/wwwroot/index.ui
new file mode 100644
index 0000000..fcf0ee2
--- /dev/null
+++ b/wwwroot/index.ui
@@ -0,0 +1,7 @@
+{
+	"widgettype":"MdWidget",
+	"options":{
+		"md_url":"{{entire_url('index.md')}}",
+		"width":"100%"
+	}
+}
diff --git a/wwwroot/v1/generate/index.dspy b/wwwroot/v1/generate/index.dspy
new file mode 100644
index 0000000..874009d
--- /dev/null
+++ b/wwwroot/v1/generate/index.dspy
@@ -0,0 +1,8 @@
+debug(f'{params_kw=}')  # log the raw request parameters
+image_path = realpath(params_kw.image_path)  # presumably maps the uploaded file reference to a filesystem path — confirm
+prompt = params_kw.prompt
+debug(f'{image_path=}, {prompt=}')
+d = await generate(image_path, prompt)  # presumably the FastVLM.generate published on ServerEnv by app/fastvlm.py — confirm
+debug(f'{image_path=}, {prompt=}, {d=}')
+return d  # result of generate(), handed back to the dspy processor as the response
+