1.系统环境

  • GPU:NVIDIA T4 × 2(每卡 16G 显存);Driver Version: 535.154.05;CUDA Version: 12.2

2.在modelscope查找对应的模型,比如我选择Qwen3.5-9B

https://www.modelscope.cn/models

3.下载vllm镜像,我是用vllm容器来加载Qwen3.5-9B模型

# Pull the vLLM image used by the later steps. Note: `latest` is an unpinned
# tag, so the version pulled depends on when this command is run.
docker pull vllm/vllm-openai:latest

4.下载Qwen3.5-9B模型

# One-off container (removed on exit) that installs the ModelScope SDK and
# downloads Qwen/Qwen3.5-9B into the host directory mounted at /models.
# The image entrypoint is replaced with bash so a shell command can be run.
docker run \
    --rm \
    --interactive \
    --tty \
    --gpus=all \
    --entrypoint=/bin/bash \
    --pids-limit=-1 \
    --security-opt=seccomp=unconfined \
    --volume=/root/lipengcheng/qwen35_9b:/models \
    --env=OMP_NUM_THREADS=8 \
    vllm/vllm-openai:latest \
    -c "pip install modelscope && python3 -c \"from modelscope import snapshot_download; snapshot_download('Qwen/Qwen3.5-9B', cache_dir='/models')\""

5.vllm加载Qwen3.5-9B模型

  • --default-chat-template-kwargs '{"enable_thinking": false}' #关闭模型的思考(thinking)模式,业务需要
# Docker Compose service that serves the downloaded Qwen3.5-9B weights through
# vLLM, sharded across two GPUs with tensor parallelism.
# The top-level `version` key is intentionally omitted: the Compose
# Specification treats it as obsolete and Compose v2 warns when it is present.
services:
  vllm-qwen:
    image: vllm/vllm-openai:latest
    container_name: vllm-qwen35-9b
    # Share the host IPC namespace. vLLM's Docker documentation asks for
    # `--ipc=host` (or a larger `--shm-size`) because PyTorch shares data
    # between the tensor-parallel worker processes through shared memory.
    # `privileged: true` was dropped: GPU access comes from the device
    # reservation below and does not require full host privileges.
    ipc: host
    deploy:
      resources:
        reservations:
          devices:
            # Reserve two NVIDIA GPUs; must match --tensor-parallel-size.
            - driver: nvidia
              count: 2
              capabilities: [gpu]
    volumes:
      # Host directory containing the Qwen3___5-9B model folder.
      - /root/lipengcheng/qwen35_9b/Qwen:/models
    ports:
      # host:container — the API is reachable on host port 23333.
      - "23333:8000"
    environment:
      NVIDIA_VISIBLE_DEVICES: "all"
    # Arguments passed to the image's entrypoint. Comments cannot be placed
    # inside the folded scalar (they would become part of the command), so the
    # non-obvious flags are documented here:
    #   --dtype half                    run in fp16 (presumably because the T4
    #                                   lacks bfloat16 support — confirm)
    #   --tensor-parallel-size 2        shard the model across the two GPUs
    #   --default-chat-template-kwargs  disable thinking mode by default
    #   --max-model-len / --max-num-seqs  cap context length and concurrency
    command: >-
      --model /models/Qwen3___5-9B
      --host 0.0.0.0
      --port 8000
      --dtype half
      --served-model-name qwen3.5-9b
      --max-model-len 4096
      --tensor-parallel-size 2
      --default-chat-template-kwargs '{"enable_thinking": false}'
      --max-num-seqs 128
    restart: unless-stopped

6.模型测试


# Smoke test: POST a single user message to the chat-completions endpoint on
# host port 23333, addressing the model by its served name.
curl --header "Content-Type: application/json" \
  --data '{
    "model": "qwen3.5-9b",
    "messages": [
      {"role": "user", "content": "你好"}
    ],
    "temperature": 0.7
  }' \
  http://localhost:23333/v1/chat/completions
{"id":"chatcmpl-9b2f7392ff7aa127","object":"chat.completion","created":1779679227,"model":"qwen3.5-9b","choices":[{"index":0,"message":{"role":"assistant","content":"你好!有什么我可以帮你的吗?无论是回答问题、创作内容,还是提供建议,我都会尽力协助你。","refusal":null,"annotations":null,"audio":null,"function_call":null,"tool_calls":[],"reasoning":null},"logprobs":null,"finish_reason":"stop","stop_reason":null,"token_ids":null}],"service_tier":null,"system_fingerprint":null,"usage":{"prompt_tokens":13,"total_tokens":37,"completion_tokens":24,"prompt_tokens_details":null},"prompt_logprobs":null,"prompt_token_ids":null,"kv_transfer_params":null}

更多推荐