1.系统环境

  • GPU:NVIDIA T4 × 2(每张 16 GB 显存),Driver Version: 535.154.05,CUDA Version: 12.2

2.打开modelscope选择对应的模型,我选择Qwen3.5-4B

https://www.modelscope.cn/models

3.vllm镜像下载,使用vllm容器加载模型

# Pull the vLLM OpenAI-compatible server image (the `latest` tag).
docker pull vllm/vllm-openai:latest

4.使用vllm镜像下载Qwen3.5-4B模型文件

# Download Qwen/Qwen3.5-4B from ModelScope inside a throwaway vLLM container.
# The image entrypoint is replaced with bash, so the container only installs
# the modelscope package and runs snapshot_download; the files are written to
# /models, which is bind-mounted from /root/lipengcheng/qwen35_4b on the host.
docker run --interactive --tty --rm \
    --entrypoint /bin/bash \
    --gpus all \
    --pids-limit -1 \
    --security-opt seccomp=unconfined \
    --env OMP_NUM_THREADS=8 \
    --volume /root/lipengcheng/qwen35_4b:/models \
    vllm/vllm-openai:latest \
    -c "pip install modelscope && python3 -c \"from modelscope import snapshot_download; snapshot_download('Qwen/Qwen3.5-4B', cache_dir='/models')\""

5.使用vllm加载Qwen3.5-4B模型:将下面的内容保存为 docker-compose.yml,然后在同一目录执行 docker compose up -d 启动服务

# Docker Compose definition that serves Qwen3.5-4B through the vLLM
# OpenAI-compatible API server, sharded across two GPUs.
#
# NOTE: the top-level `version` key is informational only on Compose v2
# (it triggers an "obsolete" warning); it is kept for older tooling.
version: '3.8'

services:
  vllm-qwen:
    image: vllm/vllm-openai:latest
    container_name: vllm-qwen35-4b
    # NOTE(review): privileged mode grants the container broad access to the
    # host. GPU access is already requested via the device reservation below,
    # so this is probably unnecessary -- confirm and drop it if possible.
    privileged: true
    # Share the host IPC namespace so the tensor-parallel worker processes can
    # use the host's shared memory. Docker's default /dev/shm is small, and
    # vLLM's Docker instructions call for --ipc=host (or a larger --shm-size)
    # when running tensor-parallel inference.
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            # Reserve two NVIDIA GPUs; must match --tensor-parallel-size below.
            - driver: nvidia
              count: 2
              capabilities: [gpu]
    volumes:
      # Host directory holding the downloaded model; the weights are then
      # visible in the container at /models/Qwen3___5-4B.
      - /root/lipengcheng/qwen35_4b/Qwen:/models
    ports:
      # Host port 23333 -> container port 8000 (the --port value below).
      - "23333:8000"
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
    # Arguments appended to the image's entrypoint. The folded scalar (>)
    # joins the lines below into one space-separated command string, so no
    # comments may be placed inside it.
    #   --dtype half                   run in float16 (presumably because the
    #                                  T4 GPUs lack bfloat16 support -- confirm)
    #   --served-model-name            model id that API clients must send
    #   --tensor-parallel-size 2       split the model across both GPUs
    #   --default-chat-template-kwargs passes enable_thinking=false to the
    #                                  chat template by default
    command: >
      --model /models/Qwen3___5-4B
      --host 0.0.0.0
      --port 8000
      --dtype half
      --served-model-name qwen3.5-4b
      --max-model-len 8192
      --tensor-parallel-size 2
      --default-chat-template-kwargs '{"enable_thinking": false}'
    restart: unless-stopped

6.测试

# Smoke-test the chat completions endpoint published on host port 23333.
# The "model" field matches the --served-model-name value the server was
# started with.
curl --header "Content-Type: application/json" \
  --data '{
    "model": "qwen3.5-4b",
    "messages": [
      {"role": "user", "content": "你好"}
    ],
    "temperature": 0.7
  }' \
  http://localhost:23333/v1/chat/completions

更多推荐