From 7d81f9093a1043e712908ff49554ee108d3a7de2 Mon Sep 17 00:00:00 2001 From: yumoqing Date: Wed, 16 Jul 2025 14:29:05 +0800 Subject: [PATCH] first commit --- README.md | 35 +++++++++++ app/fastvlm.py | 106 +++++++++++++++++++++++++++++++++ conf/config.json | 74 +++++++++++++++++++++++ files/README.md | 0 fvlm.service | 17 ++++++ logs/README.md | 0 requirements.txt | 5 ++ start.sh | 3 + stop.sh | 3 + wwwroot/bricks | 1 + wwwroot/index.md | 11 ++++ wwwroot/index.ui | 7 +++ wwwroot/v1/generate/index.dspy | 8 +++ 13 files changed, 270 insertions(+) create mode 100644 README.md create mode 100644 app/fastvlm.py create mode 100644 conf/config.json create mode 100644 files/README.md create mode 100644 fvlm.service create mode 100644 logs/README.md create mode 100644 requirements.txt create mode 100755 start.sh create mode 100755 stop.sh create mode 120000 wwwroot/bricks create mode 100644 wwwroot/index.md create mode 100644 wwwroot/index.ui create mode 100644 wwwroot/v1/generate/index.dspy diff --git a/README.md b/README.md new file mode 100644 index 0000000..20b4649 --- /dev/null +++ b/README.md @@ -0,0 +1,35 @@ +# A FastVLM webserver +[fastvlm](https://github.com/apple/ml-fastvlm) is an open-source +multimodal LLM released by Apple; it takes an image and a prompt as input and generates text + +## Create virtual environment +``` +python3 -m venv ~/fastvlm.env +source ~/fastvlm.env/bin/activate +``` + +## Installation +``` +git clone https://github.com/apple/ml-fastvlm.git +cd ml-fastvlm +pip install -U . 
#
# Modified from LLaVA/predict.py
# Please see ACKNOWLEDGEMENTS for details about LICENSE
#
import os
import torch
import time
from PIL import Image

from llava.utils import disable_torch_init
from llava.conversation import conv_templates
from llava.model.builder import load_pretrained_model
from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from ahserver.webapp import webapp
from ahserver.serverenv import ServerEnv
from appPublic.jsonConfig import getConfig
from appPublic.log import debug, exception, error
from appPublic.worker import awaitify


class FastVLM:
    """Load a FastVLM (LLaVA-style) checkpoint once and answer image+prompt requests.

    The checkpoint location and target device are read from the application
    configuration (``model_path`` and ``device`` keys).
    """

    def __init__(self):
        """Read the configuration and load tokenizer, model and image processor."""
        self.config = getConfig()
        model_path = self.config.model_path
        # NOTE: a disabled workaround that renamed ``generation_config.json`` to
        # ``.generation_config.json`` inside the model directory used to live
        # here as a string literal; it was never executed and has been dropped.

        # Load model
        disable_torch_init()
        model_name = get_model_name_from_path(model_path)
        model_base = None
        device = self.config.device
        tokenizer, model, image_processor, context_len = load_pretrained_model(
            model_path, model_base, model_name, device=device)
        self.tokenizer = tokenizer
        self.model = model
        self.image_processor = image_processor
        self.context_len = context_len

    def _generate(self, image_file, prompt,
                  temperature=0.2,
                  top_p=None,
                  num_beams=1,
                  conv_mode='qwen_2',
                  max_new_tokens=256):
        """Run one blocking generation for a single image and prompt.

        Args:
            image_file: Anything ``PIL.Image.open`` accepts (the web handler
                passes a filesystem path).
            prompt: User text; the image placeholder token is prepended to it.
            temperature: Sampling temperature; sampling is enabled only when
                this is greater than 0.
            top_p: Forwarded to ``model.generate``.
            num_beams: Forwarded to ``model.generate``.
            conv_mode: Key into ``conv_templates`` selecting the chat template.
            max_new_tokens: Upper bound on generated tokens (default 256, the
                value that was previously hard-coded).

        Returns:
            dict: ``{'timecost': <seconds spent in this call>,
            'content': <decoded, stripped model output>}``.

        Raises:
            Whatever the tokenizer, PIL or the model raise; nothing is caught
            here, so failures propagate to the caller.
        """
        started = time.perf_counter()

        # Prefix the question with the image placeholder expected by the model.
        if self.model.config.mm_use_im_start_end:
            image_tokens = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
        else:
            image_tokens = DEFAULT_IMAGE_TOKEN
        question = image_tokens + '\n' + prompt

        conv = conv_templates[conv_mode].copy()
        conv.append_message(conv.roles[0], question)
        conv.append_message(conv.roles[1], None)
        full_prompt = conv.get_prompt()

        # Set the pad token id for generation
        self.model.generation_config.pad_token_id = self.tokenizer.pad_token_id

        # Tokenize prompt
        input_ids = tokenizer_image_token(
            full_prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
        ).unsqueeze(0).to(self.model.device)

        # Load and preprocess image. ``convert`` returns an independent copy,
        # so the source file handle can be closed right away instead of
        # lingering until garbage collection.
        with Image.open(image_file) as source_image:
            image = source_image.convert('RGB')
        image_tensor = process_images([image], self.image_processor, self.model.config)[0]

        # Run inference
        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids,
                images=image_tensor.unsqueeze(0).half(),
                image_sizes=[image.size],
                do_sample=temperature > 0,
                temperature=temperature,
                top_p=top_p,
                num_beams=num_beams,
                max_new_tokens=max_new_tokens,
                use_cache=True)

        outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
        return {
            'timecost': time.perf_counter() - started,
            'content': outputs
        }

    async def generate(self, image_file, prompt):
        """Awaitable wrapper around :meth:`_generate` using default generation settings.

        ``awaitify`` comes from ``appPublic.worker``; presumably it runs the
        blocking call off the event loop -- confirm against that library.
        """
        f = awaitify(self._generate)
        return await f(image_file, prompt)


# Module-level singleton, populated by init().
fastvlm = None


def init():
    """Create the FastVLM singleton and publish it on the server environment.

    The instance must exist *before* it is stored on ``ServerEnv``; the
    previous ordering assigned ``g.fastvlm`` while the global was still
    ``None``.
    """
    global fastvlm
    fastvlm = FastVLM()
    g = ServerEnv()
    g.fastvlm = fastvlm
    g.generate = fastvlm.generate


if __name__ == "__main__":
    webapp(init)
/dev/null +++ b/conf/config.json @@ -0,0 +1,74 @@ +{ + "language":{ + "zh":{ + "sentence_splitter":"[。?!]|\r?\n" + }, + "en":{ + "sentence_splitter":"[.?!] |\r?\n" + } + }, + "model_path":"/share/models/apple/llava-fastvithd_0.5b_stage3", + "device":"cuda:0", + "filesroot":"$[workdir]$/files", + "logger":{ + "name":"fvlm", + "levelname":"info", + "logfile":"$[workdir]$/logs/fvlm.log" + }, + "website":{ + "paths":[ + ["$[workdir]$/wwwroot",""] + ], + "client_max_size":10000, + "host":"0.0.0.0", + "port":9994, + "coding":"utf-8", + "ssl_gg":{ + "crtfile":"$[workdir]$/conf/www.bsppo.com.pem", + "keyfile":"$[workdir]$/conf/www.bsppo.com.key" + }, + "indexes":[ + "index.html", + "index.tmpl", + "index.ui", + "index.dspy", + "index.md" + ], + "startswiths":[ + { + "leading":"/idfile", + "registerfunction":"idfile" + } + ], + "processors":[ + [".ws","ws"], + [".xterm","xterm"], + [".proxy","proxy"], + [".llm", "llm"], + [".llms", "llms"], + [".llma", "llma"], + [".xlsxds","xlsxds"], + [".sqlds","sqlds"], + [".tmpl.js","tmpl"], + [".tmpl.css","tmpl"], + [".html.tmpl","tmpl"], + [".bcrud", "bricks_crud"], + [".tmpl","tmpl"], + [".app","app"], + [".bui","bui"], + [".ui","bui"], + [".dspy","dspy"], + [".md","md"] + ], + "rsakey":{ + "privatekey":"$[workdir]$/conf/rsa_private_key.pem", + "publickey":"$[workdir]$/conf/rsa_public_key.pem" + }, + "session_max_time":3000, + "session_issue_time":2500, + "session_redis_notuse":{ + "url":"redis://127.0.0.1:6379" + } + } +} + diff --git a/files/README.md b/files/README.md new file mode 100644 index 0000000..e69de29 diff --git a/fvlm.service b/fvlm.service new file mode 100644 index 0000000..2820fc9 --- /dev/null +++ b/fvlm.service @@ -0,0 +1,17 @@ +[Unit] +Wants=systemd-networkd.service + +[Service] +User=ymq +Group=ymq +Type=forking +WorkingDirectory=/share/ymq/run/fvlm +# ExecStart=/share/ymq/run/fvlm/fvlm.env/bin/python app/fastvlm.py -p 9994 +ExecStart=/share/ymq/run/fvlm/start.sh +ExecStop=/share/ymq/run/fvlm/stop.sh 
+StandardOutput=append:/var/log/fvlm/fvlm.log +StandardError=append:/var/log/fvlm/fvlm.log +SyslogIdentifier=fvlm + +[Install] +WantedBy=multi-user.target diff --git a/logs/README.md b/logs/README.md new file mode 100644 index 0000000..e69de29 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..66ee57d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +llava +# git+https://github.com/apple/ml-fastvlm +git+https://git.kaiyuancloud.cn/yumoqing/apppublic +git+https://git.kaiyuancloud.cn/yumoqing/sqlor +git+https://git.kaiyuancloud.cn/yumoqing/ahserver diff --git a/start.sh b/start.sh new file mode 100755 index 0000000..0d483b1 --- /dev/null +++ b/start.sh @@ -0,0 +1,3 @@ +#!/usr/bin/bash + +CUDA_VISIBLE_DEVICES=6 /share/ymq/run/fvlm/fvlm.env/bin/python app/fastvlm.py -p 9994 & diff --git a/stop.sh b/stop.sh new file mode 100755 index 0000000..5b4db65 --- /dev/null +++ b/stop.sh @@ -0,0 +1,3 @@ +#!/usr/bin/bash + +/d/ymq/bin/killname fastvlm.py diff --git a/wwwroot/bricks b/wwwroot/bricks new file mode 120000 index 0000000..8bcfc42 --- /dev/null +++ b/wwwroot/bricks @@ -0,0 +1 @@ +/tmp/dist \ No newline at end of file diff --git a/wwwroot/index.md b/wwwroot/index.md new file mode 100644 index 0000000..c8a04db --- /dev/null +++ b/wwwroot/index.md @@ -0,0 +1,11 @@ +# A FastVLM webserver +[fastvlm](https://github.com/apple/ml-fastvlm) is an open-source +multimodal LLM released by Apple; it takes an image and a prompt as input and generates text + +## Usage +``` +curl https://{domain}/v1/generate \ + -F "prompt=描述这张图片" \ + -F "image_path=@path_to_image" +``` + diff --git a/wwwroot/index.ui b/wwwroot/index.ui new file mode 100644 index 0000000..fcf0ee2 --- /dev/null +++ b/wwwroot/index.ui @@ -0,0 +1,7 @@ +{ + "widgettype":"MdWidget", + "options":{ + "md_url":"{{entire_url('index.md')}}", + "width":"100%" + } +} diff --git a/wwwroot/v1/generate/index.dspy b/wwwroot/v1/generate/index.dspy new file mode 100644 index 0000000..874009d --- /dev/null +++ 
# wwwroot/v1/generate/index.dspy
#
# Request handler script for the /v1/generate endpoint: takes the `prompt` and
# `image_path` form fields and returns the generation result.
#
# NOTE(review): `debug`, `params_kw`, `realpath` and `generate` are not defined
# in this script; they are presumably injected by the ahserver ".dspy"
# processor. `generate` appears to be the coroutine that init() in
# app/fastvlm.py registers on ServerEnv -- confirm.
# NOTE(review): the top-level `await` and `return` imply the processor wraps
# this script in an async function -- confirm against ahserver.

# Log the raw request parameters.
debug(f'{params_kw=}')
# Resolve the `image_path` field through the injected `realpath` helper
# (presumably maps the stored upload to a filesystem path -- TODO confirm).
image_path = realpath(params_kw.image_path)
prompt = params_kw.prompt
debug(f'{image_path=}, {prompt=}')
# Run the generation and hand its result back as the response.
d = await generate(image_path, prompt)
debug(f'{image_path=}, {prompt=}, {d=}')
return d