在 Hugging Face 上下载训练好的大语言模型(这里使用图文模型 Qwen3-VL-2B-Instruct),然后加载该模型并实现与模型的对话。CPU 和 GPU 均可运行。

1、模型下载:

import os

# huggingface_hub reads HF_ENDPOINT once, when it is first imported, so the
# mirror must be configured BEFORE transformers (which imports huggingface_hub).
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText
from transformers import AutoProcessor

# Hugging Face home page: https://huggingface.co/

# Model download, method 1: load the model through transformers (this fills
# the local Hugging Face cache) and then write a standalone copy to save_path.
# Pick the hub id of the model you need; here it is the vision-language model
# Qwen/Qwen3-VL-2B-Instruct.
model_id = "Qwen/Qwen3-VL-2B-Instruct"
save_path = "qwen3-vl-2b-instruct"

# A vision-language checkpoint needs its full processor (tokenizer + image
# preprocessing config + chat template), not just the tokenizer; otherwise
# AutoProcessor.from_pretrained(save_path) cannot be used on the saved copy.
processor = AutoProcessor.from_pretrained(model_id)
tokenizer = processor.tokenizer
model = AutoModelForImageTextToText.from_pretrained(model_id, device_map="auto", torch_dtype="auto", trust_remote_code=True)

# Save everything to the target directory (the processor save includes the
# tokenizer files).
processor.save_pretrained(save_path)
model.save_pretrained(save_path)

# 模型下载方法二,使用snapshot_download下载
import os

# Mirror-site configuration. huggingface_hub reads HF_ENDPOINT when it is
# imported, so this has to run before the import below.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
# A token is only required for gated or private repositories. Do not leave a
# placeholder value here: it would be sent as a real (invalid) credential.
# Prefer exporting HF_TOKEN in your shell over hard-coding it in the script.
# os.environ["HF_TOKEN"] = "your token"
# If the download is still slow with the mirror, also disable the
# hf_transfer acceleration module:
# os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '0'

from huggingface_hub import snapshot_download

# `local_dir_use_symlinks` and `resume_download` are deprecated in current
# huggingface_hub releases: files are always written as real copies into
# `local_dir`, and interrupted downloads are resumed automatically.
snapshot_download(
    repo_id="Qwen/Qwen3-VL-2B-Instruct",
    local_dir="./qwen3-vl-2b-instruct",  # local target directory
    # max_workers=8,                     # optional: number of parallel download workers
)

# 方法三:直接在hugging face网站下载,网址:https://huggingface.co/
# 在网站上选择一个自己想要的模型,比如Qwen3-VL-2B-Instruct,将其模型页面下Files中的所有文件下载下来,
# 并放到一个文件夹中,比如文件夹名字为qwen3-vl-2b-instruct,然后后续将该目录路径作为模型加载路径

网页下载方式如下图所示,将所有文件下载到本地的指定目录:

2、模型调用:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Qwen3-VL-2B-Instruct terminal chat script.

Supports mixed image + text conversations: a local image can be loaded and
asked about. After starting the script, load an image with `/image <path>`
or simply type text for a text-only conversation.
"""

import sys
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor
from PIL import Image
import os
# SECURITY: never hard-code a Hugging Face access token in source. The model
# below is loaded from a local directory, so no token is needed; if one is
# ever required, export HF_TOKEN in the shell environment instead.
from pathlib import Path

# Configuration - adjust to your environment.
MODEL_NAME = "Qwen/Qwen3-VL-2B-Instruct"
# Local directory holding the downloaded model; can be overridden with the
# QWEN_VL_MODEL_PATH environment variable without editing this file.
MODEL_PATH = os.environ.get(
    "QWEN_VL_MODEL_PATH",
    "/data/learning_example/use_llm/demo_use_qwen/qwen3-vl-2b-instruct",
)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # auto-detect GPU
USE_4BIT = False  # load with 4-bit quantization (less GPU memory, somewhat slower)

def load_model():
    """Load the processor and the model from ``MODEL_PATH``.

    On CUDA the weights are loaded in bfloat16 with ``device_map="auto"``;
    on CPU they are loaded in float32 without a device map. When ``USE_4BIT``
    is set and CUDA is available, a 4-bit ``BitsAndBytesConfig`` load is
    attempted first, falling back to the regular load on ``ImportError``.

    Returns:
        tuple: ``(model, processor)``.
    """
    print(f"正在加载模型: {MODEL_NAME}")
    print(f"使用设备: {DEVICE}")

    on_gpu = DEVICE == "cuda"
    # bfloat16 on GPU, full precision on CPU.
    torch_dtype = torch.bfloat16 if on_gpu else torch.float32

    print("开始加载处理器...")
    processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)

    print("成功加载处理器,开始加载模型...")

    def _load_unquantized():
        # Regular (non-quantized) load, shared by the default path and the
        # fallback taken when the 4-bit load cannot be performed.
        return AutoModelForImageTextToText.from_pretrained(
            MODEL_PATH,
            torch_dtype=torch_dtype,
            device_map="auto" if on_gpu else None,
            trust_remote_code=True,
        )

    if not (USE_4BIT and on_gpu):
        model = _load_unquantized()
    else:
        # 4-bit quantized load (requires the bitsandbytes library).
        try:
            from transformers import BitsAndBytesConfig

            four_bit = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.bfloat16,
            )
            model = AutoModelForImageTextToText.from_pretrained(
                MODEL_PATH,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                quantization_config=four_bit,
                trust_remote_code=True,
            )
        except ImportError:
            print("警告: bitsandbytes未安装,回退到普通加载模式")
            model = _load_unquantized()

    print("成功加载模型。")
    if not on_gpu:
        # No device map was used on CPU, so place the model explicitly.
        print("当前为cpu模式,对model进行额外处理...")
        model = model.to("cpu")

    print("模型加载完成!")
    return model, processor

def load_image(image_path):
    """Open an image file and return it converted to RGB.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The RGB ``PIL.Image.Image``, or ``None`` if the image could not be
        loaded (the error is printed rather than raised so the interactive
        loop can keep running).
    """
    try:
        # convert() returns an independent in-memory copy, so the file handle
        # opened by Image.open can be released as soon as the block exits.
        with Image.open(image_path) as source:
            image = source.convert("RGB")
    except Exception as e:  # deliberate best-effort boundary: report and carry on
        print(f"加载图片失败: {e}")
        return None

    print(f"已加载图片: {image_path}")
    print(f"图片尺寸: {image.size}")
    return image

def _images_in(messages):
    """Return every image object carried by ``messages``, in prompt order."""
    found = []
    for message in messages:
        content = message.get("content")
        if isinstance(content, str):
            continue  # plain-string content cannot carry an image part
        for part in content or []:
            if (
                isinstance(part, dict)
                and part.get("type") == "image"
                and part.get("image") is not None
            ):
                found.append(part["image"])
    return found


def chat_with_model(model, processor, image, user_input, history=None):
    """Run one conversation turn and return the reply plus the new history.

    The chat template emits one image placeholder for every image part in the
    whole conversation, so the processor must receive exactly the same number
    of images. The images are therefore collected from *all* messages
    (history included) rather than only from the current turn. An image that
    is already part of the history (same object) is not attached again; later
    turns simply refer back to it.

    Args:
        model: Model instance exposing ``generate`` and ``device``.
        processor: Processor instance (chat template, tokenizer, decoding).
        image: PIL image for this turn, or ``None`` for a text-only turn.
        user_input: The user's text.
        history: Previous messages (optional). The list is not mutated.

    Returns:
        tuple: ``(response, updated_history)`` where ``response`` is the
        decoded reply text and ``updated_history`` is a new list containing
        the previous messages, this user message and the assistant reply.
    """
    previous = list(history) if history else []

    # Build this turn's content; attach the image only if the conversation
    # does not already contain that very image.
    already_attached = any(seen is image for seen in _images_in(previous))
    content = []
    if image is not None and not already_attached:
        content.append({"type": "image", "image": image})
    content.append({"type": "text", "text": user_input})

    messages = previous + [{"role": "user", "content": content}]

    # Render the conversation into the model's prompt format.
    text = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # One image per placeholder in the rendered prompt; None for text-only.
    all_images = _images_in(messages)
    inputs = processor(
        text=[text],
        images=all_images if all_images else None,
        return_tensors="pt"
    )

    # Follow the model's actual placement instead of a global device flag.
    inputs = inputs.to(model.device)

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            pad_token_id=processor.tokenizer.pad_token_id,
            eos_token_id=processor.tokenizer.eos_token_id
        )

    # generate() returns prompt + completion; keep only the newly generated part.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]

    assistant_message = {"role": "assistant", "content": [{"type": "text", "text": output_text}]}
    updated_history = messages + [assistant_message]

    return output_text, updated_history

def main():
    """Run the interactive chat loop.

    Prints the usage banner, loads the model once, then reads user input
    until `/quit`, `/exit` or end-of-input. Lines starting with `/` are
    commands; anything else is sent to the model together with the currently
    loaded image (if any) and the conversation history.

    Errors raised while generating a reply are reported and the loop keeps
    running, so one failed turn does not throw away the loaded model.
    """
    print("=" * 50)
    print("Qwen3-VL-2B-Instruct 终端交互程序")
    print("=" * 50)
    print("使用说明:")
    print("  1. 输入图片路径并回车,然后输入问题")
    print("  2. 直接输入文本开始纯文本对话")
    print("  3. 特殊命令:")
    print("     /quit 或 /exit  - 退出程序")
    print("     /clear          - 清除对话历史")
    print("     /image 图片路径  - 切换/加载新图片")
    print("     /status         - 显示当前状态")
    print("=" * 50)

    # Load the model once, up front.
    print("开始加载模型...")
    model, processor = load_model()
    print(f"success load model and tokenizer from path: {MODEL_PATH}.")
    print("=" * 50)

    # Conversation state.
    current_image = None
    current_image_path = None
    history = []

    print("\n准备好了!请输入图片路径开始对话,或直接输入文本。\n")

    while True:
        try:
            user_input = input("\n[你] >>> ").strip()
        except EOFError:
            # stdin was closed (Ctrl+D / piped input exhausted): leave cleanly
            # instead of crashing with a traceback.
            print("\n再见!")
            break
        except KeyboardInterrupt:
            print("\n\n检测到中断,输入 /quit 退出程序")
            continue

        if not user_input:
            continue

        # Slash commands.
        if user_input.startswith('/'):
            cmd_parts = user_input.split(maxsplit=1)
            cmd = cmd_parts[0].lower()

            if cmd in ['/quit', '/exit']:
                print("再见!")
                break
            elif cmd == '/clear':
                history = []
                print("对话历史已清除")
            elif cmd == '/image':
                if len(cmd_parts) < 2:
                    # Previously reported as an unknown command, which was misleading.
                    print("用法: /image 图片路径")
                    continue
                new_image_path = cmd_parts[1].strip()
                new_image = load_image(new_image_path)
                if new_image is not None:
                    current_image = new_image
                    current_image_path = new_image_path
                    # Start a fresh conversation for the new image.
                    history = []
                    print(f"已切换到图片: {current_image_path}")
                    print("现在可以输入问题询问关于这张图片的内容。")
                else:
                    print("图片加载失败,请检查路径")
            elif cmd == '/status':
                # Report the precision the weights actually have instead of a
                # fixed "FP16" label (they are loaded as bf16 on GPU, fp32 on CPU).
                # 4-bit is only attempted when USE_4BIT is set and CUDA is in use.
                if USE_4BIT and DEVICE == "cuda":
                    precision = "4bit"
                else:
                    precision = f"无 ({getattr(model, 'dtype', '未知')})"
                print("\n当前状态:")
                print(f"  设备: {DEVICE}")
                print(f"  量化: {precision}")
                print(f"  图片: {current_image_path if current_image_path else '无'}")
                print(f"  历史消息数: {len(history)}")
                print(f"历史消息:\n {history}")
            else:
                print(f"未知命令: {cmd}")
            continue

        # current_image is None for a text-only conversation.
        try:
            response, history = chat_with_model(
                model, processor, current_image, user_input, history
            )
        except KeyboardInterrupt:
            print("\n\n检测到中断,输入 /quit 退出程序")
            continue
        except Exception as e:  # top-level boundary of the interactive loop
            print(f"\n发生错误: {e}")
            print("如果持续出错,请输入 /clear 清除历史或重新加载图片")
            continue

        print(f"\n[模型] >>> {response}")


if __name__ == "__main__":
    main()

3、使用效果:

更多推荐