使用SGLang部署Qwen3 Reranker系列模型

使用vLLM部署的版本可查看另一篇文章:使用vLLM部署Qwen3 Reranker系列模型
实测使用vLLM部署的推理速度更快,QPS更高
SGLang安装
根据官方文档进行安装(SGLang官方文档,Qwen官方SGLang安装文档)
# Create and activate an isolated conda environment running Python 3.10.
conda create -n myenv python=3.10 -y
conda activate myenv
# Install SGLang together with its "all" optional extras.
pip install "sglang[all]"
SGLang部署Qwen3 Reranker系列(0.6B/4B/8B)模型
按照官方部署Reranker模型的教程,使用SGLang部署Qwen3 Reranker系列模型时会出现报错。先说结论:SGLang是可以部署Qwen3 Reranker系列模型的,只是需要先对模型进行一定的转换。
官方给出了使用SGLang部署Reranker模型的方法(官方Reranker模型部署流程)。对于BGE等系列的reranker模型,可以直接按照官方流程进行:在启动参数中加入 --is-embedding 参数,即可完成模型的部署上线。
然而,依照上述流程应用在Qwen3 Reranker系列的模型时,会出现如下的报错:
{'object': 'error', 'message': '1 validation error for RerankResponse\nscore\n Input should be a valid number [type=float_type, input_value=[-0.023193359375, 0.03881...3671875, 0.015869140625], input_type=list]\n For further information visit https://errors.pydantic.dev/2.12/v/float_type', 'type': 'BadRequest', 'param': None, 'code': 400}
SGLang无法通过 --is-embedding 参数正确地加载该模型:尽管模型可以部署成功,但客户端完全无法调用相应的接口。这是因为 Qwen3 Reranker 采用的是 Qwen3ForCausalLM 架构,是基于生成式架构改良而来的,SGLang目前无法将这类架构的模型封装为Reranker使用。不过,借鉴vLLM部署的解决思路,可以将模型转换为真正的二分类模型,然后使用 classify 接口进行调用。转换模型的代码如下所示(代码参考来源)。
import torch
from transformers import Qwen3ForCausalLM, Qwen3ForSequenceClassification, AutoTokenizer
def convert_model(model_path, save_path):
    """Convert a Qwen3 Reranker causal LM into a 2-label sequence classifier.

    The rows of ``lm_head`` that produce the "no" and "yes" logits are copied
    into the ``score`` head of a ``Qwen3ForSequenceClassification`` model, in
    that order, so that class index 0 is "no" and class index 1 is "yes".
    A softmax over the two class logits therefore yields P(yes) at index 1.
    After a small sanity check that compares the two models on one sample
    sentence, the converted model and its tokenizer are written to
    ``save_path``.

    Args:
        model_path: Directory (or hub id) of the original causal LM checkpoint.
        save_path: Directory the converted model and tokenizer are saved to.

    Raises:
        ValueError: If the tokenizer has no dedicated "yes" or "no" token.
    """
    # --- Step 1: Load the Causal LM and extract lm_head weights ---
    print(f"1. Loading Causal LM: {model_path}")
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    causal_lm = Qwen3ForCausalLM.from_pretrained(model_path)
    # The lm_head is the final linear layer that maps hidden states to vocabulary logits
    lm_head_weights = causal_lm.lm_head.weight
    print(f" lm_head weight shape: {lm_head_weights.shape}")  # (vocab_size, hidden_size)

    # --- Step 2: Get the token IDs for "yes" and "no" ---
    print("\n2. Finding token IDs for 'yes' and 'no'")
    yes_token_id = tokenizer.convert_tokens_to_ids("yes")
    no_token_id = tokenizer.convert_tokens_to_ids("no")
    print(f" ID for 'yes': {yes_token_id}, ID for 'no': {no_token_id}")
    # Fail loudly on a missing token: indexing the weight matrix with None
    # (or with the unknown-token id) would silently build a wrong head.
    unk_token_id = tokenizer.unk_token_id
    for token, token_id in (("yes", yes_token_id), ("no", no_token_id)):
        if token_id is None or token_id == unk_token_id:
            raise ValueError(
                f"Token {token!r} is not in the tokenizer vocabulary of {model_path}"
            )

    # --- Step 3: Extract the classifier vectors ---
    print("\n3. Creating the classifier vector from lm_head weights")
    # Extract the specific rows (weight vectors) for our target tokens
    yes_vector = lm_head_weights[yes_token_id]
    no_vector = lm_head_weights[no_token_id]
    # The difference vector is only reported here; the 2-label head built
    # below keeps both rows instead of collapsing them into a single logit.
    classifier_vector = yes_vector - no_vector
    print(f" Shape of the new classifier vector: {classifier_vector.shape}")

    # --- Step 4: Load the model as a Sequence Classifier ---
    print("\n4. Loading Sequence Classification model with num_labels=2")
    # Two labels: index 0 = "no", index 1 = "yes".
    seq_cls_model = Qwen3ForSequenceClassification.from_pretrained(
        model_path,
        num_labels=2,
        ignore_mismatched_sizes=True,
    )

    # --- Step 5: Replace the classifier's weights ---
    print("\n5. Replacing the randomly initialized classifier weights")
    # The classification head in Qwen is named 'score'. It's a torch.nn.Linear layer.
    # Its weight matrix has shape (num_labels, hidden_size), which is (2, hidden_size) here.
    with torch.no_grad():
        weight_matrix = torch.stack([no_vector, yes_vector], dim=0)  # (2, hidden_size)
        seq_cls_model.score.weight.copy_(weight_matrix)
        # It's good practice to zero out the bias for a clean transfer
        if seq_cls_model.score.bias is not None:
            seq_cls_model.score.bias.zero_()
    print(" Classifier head replaced successfully.")

    # --- Verification: Prove that the logic works ---
    print("\n--- VERIFICATION ---")
    text = "Is this a good example?"
    inputs = tokenizer(text, return_tensors="pt")

    # A. Get the "yes"/"no" logits from the original Causal LM
    with torch.no_grad():
        outputs_causal = causal_lm(**inputs)
        last_token_logits = outputs_causal.logits[0, -1, :]
        manual_logit_diff = last_token_logits[yes_token_id] - last_token_logits[no_token_id]
        # Softmax over (yes, no); index 0 is therefore P(yes)
        concat_logits = torch.stack(
            [last_token_logits[yes_token_id], last_token_logits[no_token_id]]
        )
        causal_prob = torch.softmax(concat_logits, dim=-1)[0]

    # B. Get the two class logits from the new Sequence Classification model
    with torch.no_grad():
        outputs_seq_cls = seq_cls_model(**inputs)
        logits = outputs_seq_cls.logits.squeeze(0)  # shape: (2,)
        probs = torch.softmax(logits, dim=-1)  # shape: (2,), index 1 is P(yes)

    print(f"Input text: '{text}'")
    print(f"\nManual logit difference ('yes' - 'no'): {manual_logit_diff.item():.4f}")
    # Probs
    print(f"\nCausal prob (2 classes): {causal_prob.item():.4f}")
    print(f"Classification prob (2 class): {probs[1].item():.4f}")
    print(f"Are they almost identical? {torch.allclose(causal_prob, probs[1])}")

    seq_cls_model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"Save model to: {save_path}")
if __name__ == "__main__":
    # Script entry point: convert the local 0.6B reranker checkpoint and
    # write the SGLang-ready copy next to it.
    convert_model(
        model_path="/home/Qwen/Qwen3-Reranker-0.6B",
        save_path="/home/Qwen/Qwen3-Reranker-0.6B-sglang",
    )
转换结果如下所示,可以看到,转换后的模型和转换前输出是一致的,转换之后我们只看单类概率即可,即 P(yes)

模型部署上线
# Start the SGLang server on port 30000, serving the converted model directory.
python3 -m sglang.launch_server --model /home/Qwen/Qwen3-Reranker-0.6B-sglang --port 30000
客户端调用 classify 接口
import requests
import json
# Fixed text wrapped around every (instruction, query, document) triple
# before it is sent to the classify endpoint.
prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
document_template = "<Document>: {doc}{suffix}"
instruction = (
    "Given a web search query, retrieve relevant passages that answer the query"
)
queries = [
    "What is the capital of China?",
    "Explain gravity",
]
documents = [
    "I want to eat an apple",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
]

# Queries and documents are paired positionally: one request per pair.
for query, doc in zip(queries, documents):
    query_text = query_template.format(prefix=prefix, instruction=instruction, query=query)
    doc_text = document_template.format(doc=doc, suffix=suffix)
    input_content = query_text + doc_text
    # Make classification request
    response = requests.post(
        "http://127.0.0.1:30000/v1/classify",
        headers={"Content-Type": "application/json"},
        json={
            "input": input_content
        },
        # Without a timeout the call can block forever if the server hangs.
        timeout=30,
    )
    # Parse response
    result = response.json()
    print(json.dumps(result, indent=2))
调用输出
{
"id": "classify-1a2edda3af624e24bac6e0af73a98477",
"object": "list",
"created": 1765270422,
"model": "/home/Qwen/Qwen3-Reranker-0.6B-sglang",
"data": [
{
"index": 0,
"label": "LABEL_0",
"probs": [
0.9999761581420898,
2.3783744836691767e-05
],
"num_classes": 2
}
],
"usage": {
"prompt_tokens": 84,
"total_tokens": 84,
"completion_tokens": 0,
"prompt_tokens_details": null,
"reasoning_tokens": 0
}
}
{
"id": "classify-59ac286af68c4d36bf568812f4872be8",
"object": "list",
"created": 1765270422,
"model": "/home/Qwen/Qwen3-Reranker-0.6B-sglang",
"data": [
{
"index": 0,
"label": "LABEL_1",
"probs": [
0.0006023541209287941,
0.9993976354598999
],
"num_classes": 2
}
],
"usage": {
"prompt_tokens": 104,
"total_tokens": 104,
"completion_tokens": 0,
"prompt_tokens_details": null,
"reasoning_tokens": 0
}
}
这里我们只看 LABEL_1 的概率即可:转换时分类头的权重按 [no, yes] 的顺序堆叠,因此 LABEL_1 对应 "yes",其概率即为相关性分数。
更多推荐

所有评论(0)