Now that we have covered the metrics and methods for evaluating quantized large language models, it is time to put that knowledge into practice. This section is a hands-on guide to setting up and running a benchmark that compares a quantized LLM against its full-precision counterpart. We will measure performance characteristics such as latency, throughput, and memory usage, and assess the impact on model quality using perplexity.
The goal is to collect concrete data that makes the trade-offs of quantization visible, so that you can make informed model deployment decisions.
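As a quick reminder before we measure it: perplexity is the exponentiated average negative log-likelihood that the model assigns to a held-out corpus, so lower values mean the model fits the reference text better:

$$\mathrm{PPL} = \exp\left(-\frac{1}{N}\sum_{i=1}^{N}\log p_\theta\left(x_i \mid x_{<i}\right)\right)$$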
Before starting, make sure you have the required environment. This exercise assumes access to a CUDA-enabled GPU machine with the necessary Python libraries installed.
pip install transformers
pip install accelerate
pip install datasets
pip install evaluate
pip install bitsandbytes   (only needed if you benchmark bitsandbytes-quantized models)
pip install auto-gptq      (or pip install autoawq, depending on the quantization format)

For this example, we will compare a baseline FP16 model (for example, meta-llama/Llama-2-7b-hf) against a corresponding INT4 quantized version (for example, one produced with GPTQ). Replace the model identifiers with the specific models you want to evaluate, and make sure the quantized model is compatible with your environment and libraries (for example, GPTQ models require auto-gptq).
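If you would rather quantize on the fly with bitsandbytes instead of using a pre-quantized GPTQ checkpoint, a minimal loading sketch looks like the following. The 4-bit settings shown (NF4 quantization, FP16 compute) are common choices, not requirements, and the model id simply mirrors the baseline used below:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Sketch: load the baseline checkpoint with on-the-fly 4-bit quantization.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # quantize weights to 4 bits at load time
    bnb_4bit_quant_type="nf4",             # NF4 quantization data type
    bnb_4bit_compute_dtype=torch.float16,  # run matmuls in FP16
)
model_4bit = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",            # same baseline model id as in the script below
    quantization_config=bnb_config,
    device_map="auto",
)
```

To benchmark this variant with the same code, you could pass a similar `quantization_config` entry through `model_kwargs` in the `load_model_and_tokenizer` helper defined in the script below.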
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
import evaluate
from datasets import load_dataset
import numpy as np
# Configuration
baseline_model_id = "meta-llama/Llama-2-7b-hf"  # Replace with your baseline model
quantized_model_id = "TheBloke/Llama-2-7B-GPTQ"  # Replace with your quantized model
device = "cuda" if torch.cuda.is_available() else "cpu"
num_samples = 50  # Number of samples used for latency/throughput benchmarking
max_new_tokens = 100  # Number of tokens to generate
perplexity_dataset = "wikitext"
perplexity_dataset_config = "wikitext-2-raw-v1"
perplexity_split = "test"
perplexity_max_samples = 50  # Reduce this value for a faster evaluation
# Example prompt for the generation task
prompt = "The field of Large Language Models is "
print(f"Using device: {device}")
if device == "cpu":
print("警告:在CPU上进行基准测试会显著慢得多,并且内存使用模式也不同。")
# --- 辅助函数 ---
def load_model_and_tokenizer(model_id, is_quantized=False):
    print(f"Loading model: {model_id}...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # Set a padding token if one is missing
    model_kwargs = {"device_map": "auto"}
    if is_quantized:
        # Add quantization-specific loading arguments here if needed.
        # AutoGPTQ example:
        # model_kwargs["use_safetensors"] = True
        # model_kwargs["trust_remote_code"] = True  # Be cautious with trust_remote_code
        pass  # Add loading arguments specific to your quantized model type here
    model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
    model.eval()  # Put the model in evaluation mode
    print("Model loaded.")
    return model, tokenizer
def measure_latency(model, tokenizer, prompt, max_new_tokens, num_runs=10):
    print("Measuring latency...")
    latencies = []
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # Warmup run
    with torch.no_grad():
        _ = model.generate(**inputs, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.pad_token_id)
    torch.cuda.synchronize()  # Make sure pending GPU work has finished
    for _ in range(num_runs):
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        start_event.record()
        with torch.no_grad():
            _ = model.generate(**inputs, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.pad_token_id)
        end_event.record()
        torch.cuda.synchronize()  # Wait for the timed work to finish
        latency_ms = start_event.elapsed_time(end_event)
        latencies.append(latency_ms)
        # print(f"Single-run latency: {latency_ms:.2f} ms")  # Optional: print per-run latency
    avg_latency = np.mean(latencies)
    print(f"Average latency ({num_runs} runs): {avg_latency:.2f} ms")
    return avg_latency
def measure_throughput(model, tokenizer, prompt, max_new_tokens, num_samples=50):
    print("Measuring throughput...")
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    total_tokens_generated = 0
    total_time_sec = 0
    # Warmup
    with torch.no_grad():
        _ = model.generate(**inputs, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.pad_token_id)
    torch.cuda.synchronize()
    start_time = time.time()
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    with torch.no_grad():
        # Note: this simple setup does not batch or parallelize requests.
        # Throughput is usually measured with batching or concurrent requests;
        # for simplicity we measure sequential, single-stream generation speed here.
        # Cap the request so prompt + generated tokens stay within the model's context window.
        context_limit = getattr(model.config, "max_position_embeddings", 4096)
        total_new_tokens = min(num_samples * max_new_tokens, context_limit - inputs.input_ids.shape[1] - 1)
        outputs = model.generate(**inputs, max_new_tokens=total_new_tokens, pad_token_id=tokenizer.pad_token_id, do_sample=False)  # Generate one long sequence
        generated_tokens = outputs[0][inputs.input_ids.shape[1]:].size(0)  # Count generated tokens, excluding the prompt
        total_tokens_generated = generated_tokens
    end_event.record()
    torch.cuda.synchronize()
    total_time_sec = start_event.elapsed_time(end_event) / 1000.0  # Time in seconds
    # Alternative: CPU wall-clock timing (less precise for GPU work)
    # total_time_sec = time.time() - start_time
    throughput_tokens_per_sec = total_tokens_generated / total_time_sec if total_time_sec > 0 else 0
    print(f"Total tokens generated: {total_tokens_generated}")
    print(f"Total time: {total_time_sec:.2f} s")
    print(f"Throughput: {throughput_tokens_per_sec:.2f} tokens/s")
    return throughput_tokens_per_sec
def measure_memory_usage(model_load_fn, *args, **kwargs):
    print("Measuring peak memory usage...")
    torch.cuda.reset_peak_memory_stats(device)
    # Load the model inside this function so its memory footprint is captured
    model, tokenizer = model_load_fn(*args, **kwargs)
    memory_after_load = torch.cuda.max_memory_allocated(device)
    print(f"Memory after load: {memory_after_load / (1024**3):.2f} GB")
    # Run a small sample inference to capture runtime memory
    inputs = tokenizer("Sample text for memory measurement.", return_tensors="pt").to(device)
    with torch.no_grad():
        _ = model.generate(**inputs, max_new_tokens=10, pad_token_id=tokenizer.pad_token_id)
    torch.cuda.synchronize()
    peak_memory = torch.cuda.max_memory_allocated(device)
    peak_memory_gb = peak_memory / (1024**3)  # Convert bytes to GB
    print(f"Peak memory usage during inference: {peak_memory_gb:.2f} GB")
    # Free memory
    del model
    del tokenizer
    torch.cuda.empty_cache()
    return peak_memory_gb
def calculate_perplexity(model, tokenizer, dataset_name, dataset_config, split, max_samples=50):
    print(f"Calculating perplexity on {dataset_name} ({split} split)...")
    try:
        data = load_dataset(dataset_name, dataset_config, split=f"{split}[:{max_samples}]")  # Slice for a faster evaluation
        texts = [t for t in data["text"] if t.strip()]  # Skip empty lines
        # Note: the Hugging Face `evaluate` perplexity metric loads its own model from a
        # model_id string, so we compute perplexity directly with the already-loaded model:
        # token-weighted average negative log-likelihood, exponentiated.
        nlls = []
        token_count = 0
        with torch.no_grad():
            for text in texts:
                enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024).to(device)
                input_ids = enc.input_ids
                if input_ids.size(1) < 2:
                    continue
                loss = model(input_ids, labels=input_ids).loss  # Mean NLL over the shifted tokens
                num_tokens = input_ids.size(1) - 1
                nlls.append(loss.float() * num_tokens)
                token_count += num_tokens
        ppl = torch.exp(torch.stack(nlls).sum() / token_count).item()
        print(f"Perplexity: {ppl:.4f}")
        return ppl
    except Exception as e:
        print(f"Error while computing perplexity: {e}")
        print("Skipping perplexity calculation.")
        return None
# --- Benchmark execution ---
results = {}
# Benchmark the baseline model
print("\n--- Benchmarking baseline model ---")
# Measure memory separately so peak usage during loading is captured
baseline_memory = measure_memory_usage(load_model_and_tokenizer, baseline_model_id, is_quantized=False)
# Load again for the remaining benchmarks
baseline_model, baseline_tokenizer = load_model_and_tokenizer(baseline_model_id, is_quantized=False)
baseline_latency = measure_latency(baseline_model, baseline_tokenizer, prompt, max_new_tokens)
baseline_throughput = measure_throughput(baseline_model, baseline_tokenizer, prompt, max_new_tokens, num_samples)
baseline_perplexity = calculate_perplexity(baseline_model, baseline_tokenizer, perplexity_dataset, perplexity_dataset_config, perplexity_split, perplexity_max_samples)
results["baseline"] = {
"latency_ms": baseline_latency,
"throughput_tokens_sec": baseline_throughput,
"peak_memory_gb": baseline_memory,
"perplexity": baseline_perplexity
}
# Free the baseline model before loading the quantized model
print("Cleaning up baseline model...")
del baseline_model
del baseline_tokenizer
torch.cuda.empty_cache()
print("Baseline model cleanup complete.")
# Benchmark the quantized model
print("\n--- Benchmarking quantized model ---")
# Measure memory separately
quantized_memory = measure_memory_usage(load_model_and_tokenizer, quantized_model_id, is_quantized=True)
# Load again for the remaining benchmarks
quantized_model, quantized_tokenizer = load_model_and_tokenizer(quantized_model_id, is_quantized=True)
quantized_latency = measure_latency(quantized_model, quantized_tokenizer, prompt, max_new_tokens)
quantized_throughput = measure_throughput(quantized_model, quantized_tokenizer, prompt, max_new_tokens, num_samples)
quantized_perplexity = calculate_perplexity(quantized_model, quantized_tokenizer, perplexity_dataset, perplexity_dataset_config, perplexity_split, perplexity_max_samples)
results["quantized"] = {
"latency_ms": quantized_latency,
"throughput_tokens_sec": quantized_throughput,
"peak_memory_gb": quantized_memory,
"perplexity": quantized_perplexity
}
# Free the quantized model
print("Cleaning up quantized model...")
del quantized_model
del quantized_tokenizer
torch.cuda.empty_cache()
print("Quantized model cleanup complete.")
# --- Results analysis ---
print("\n--- Benchmark results summary ---")
print(f"{'Metric':<25} {'Baseline':<15} {'Quantized':<15} {'Change (%)':<15}")
print("-" * 70)
# Latency
base_lat = results["baseline"]["latency_ms"]
quant_lat = results["quantized"]["latency_ms"]
lat_change = ((quant_lat - base_lat) / base_lat) * 100 if base_lat else 0
print(f"{'Avg latency (ms)':<25} {base_lat:<15.2f} {quant_lat:<15.2f} {lat_change:<15.2f}")
# Throughput
base_thr = results["baseline"]["throughput_tokens_sec"]
quant_thr = results["quantized"]["throughput_tokens_sec"]
thr_change = ((quant_thr - base_thr) / base_thr) * 100 if base_thr else 0
print(f"{'Throughput (tokens/s)':<25} {base_thr:<15.2f} {quant_thr:<15.2f} {thr_change:<15.2f}")
# Memory
base_mem = results["baseline"]["peak_memory_gb"]
quant_mem = results["quantized"]["peak_memory_gb"]
mem_change = ((quant_mem - base_mem) / base_mem) * 100 if base_mem else 0
print(f"{'Peak memory (GB)':<25} {base_mem:<15.2f} {quant_mem:<15.2f} {mem_change:<15.2f}")
# Perplexity
base_ppl = results["baseline"]["perplexity"]
quant_ppl = results["quantized"]["perplexity"]
if base_ppl is not None and quant_ppl is not None:
    ppl_change = ((quant_ppl - base_ppl) / base_ppl) * 100
    print(f"{'Perplexity':<25} {base_ppl:<15.4f} {quant_ppl:<15.4f} {ppl_change:<15.2f}")
else:
    print(f"{'Perplexity':<25} {'N/A':<15} {'N/A':<15} {'N/A':<15}")
print("-" * 70)
# Optional: visualization
# Prepare data for plotting; the fallback numbers below are illustrative
# placeholders used only when a measurement is missing.
base_lat_val = results["baseline"]["latency_ms"] if results["baseline"]["latency_ms"] else 1000
quant_lat_val = results["quantized"]["latency_ms"] if results["quantized"]["latency_ms"] else 500
base_thr_val = results["baseline"]["throughput_tokens_sec"] if results["baseline"]["throughput_tokens_sec"] else 50
quant_thr_val = results["quantized"]["throughput_tokens_sec"] if results["quantized"]["throughput_tokens_sec"] else 100
base_mem_val = results["baseline"]["peak_memory_gb"] if results["baseline"]["peak_memory_gb"] else 15
quant_mem_val = results["quantized"]["peak_memory_gb"] if results["quantized"]["peak_memory_gb"] else 8
base_ppl_val = results["baseline"]["perplexity"] if results["baseline"]["perplexity"] else 5.0
quant_ppl_val = results["quantized"]["perplexity"] if results["quantized"]["perplexity"] else 5.5
```plotly
{"layout": {"title": "Baseline (FP16) vs. Quantized (INT4) LLM Performance", "barmode": "group", "xaxis": {"title": "Metric"}, "yaxis": {"title": "Value"}, "legend_title_text": "Model Type", "height": 400}, "data": [{"type": "bar", "name": "Baseline (FP16)", "x": ["Latency (ms)", "Throughput (tok/s)", "Memory (GB)", "Perplexity"], "y": [base_lat_val, base_thr_val, base_mem_val, base_ppl_val], "marker": {"color": "#4263eb"}}, {"type": "bar", "name": "Quantized (INT4)", "x": ["Latency (ms)", "Throughput (tok/s)", "Memory (GB)", "Perplexity"], "y": [quant_lat_val, quant_thr_val, quant_mem_val, quant_ppl_val], "marker": {"color": "#12b886"}}]}
```
Comparison of the baseline FP16 model and its INT4 quantized version on performance and quality metrics. Lower is better for latency, memory, and perplexity; higher is better for throughput. (Note: values are illustrative.)
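The chart above is rendered from the plotly spec. If you are running the script locally and want a quick plot, a matplotlib sketch along the same lines (assuming matplotlib is installed, and reusing the *_val variables defined above) could look like this:

```python
import matplotlib.pyplot as plt
import numpy as np

# Sketch: grouped bar chart of the four metrics for both models.
metrics = ["Latency (ms)", "Throughput (tok/s)", "Memory (GB)", "Perplexity"]
baseline_vals = [base_lat_val, base_thr_val, base_mem_val, base_ppl_val]
quantized_vals = [quant_lat_val, quant_thr_val, quant_mem_val, quant_ppl_val]

x = np.arange(len(metrics))
width = 0.35
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(x - width / 2, baseline_vals, width, label="Baseline (FP16)", color="#4263eb")
ax.bar(x + width / 2, quantized_vals, width, label="Quantized (INT4)", color="#12b886")
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.set_title("Baseline (FP16) vs. Quantized (INT4) LLM Performance")
ax.legend(title="Model Type")
plt.tight_layout()
plt.show()
```

Because the four metrics live on very different scales, separate subplots or normalized values may read better than a single shared axis.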
The output table and chart give you a quantitative comparison. You should typically see lower latency, higher throughput, and a substantially smaller peak memory footprint for the quantized model, at the cost of a small increase in perplexity; the quantized checkpoint is also smaller on disk (you can confirm this with du -sh model_directory).
This hands-on benchmark provides important data points. Keep in mind that the results are specific to the hardware used, the chosen models, the quantization technique (for example GPTQ, AWQ, or bitsandbytes), and the benchmark setup itself (batch size, sequence length, dataset). For production use, you would extend this work to cover your own workloads, serving stack, and evaluation datasets.
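Because the numbers only make sense relative to a specific hardware and software setup, it helps to persist each run together with its context. Below is a minimal sketch; the file name and metadata fields are arbitrary choices, and it simply reuses the results dict and model ids from the script above:

```python
import json
import platform
import torch
import transformers

# Sketch: save the benchmark results along with enough context to compare runs later.
run_record = {
    "results": results,
    "baseline_model_id": baseline_model_id,
    "quantized_model_id": quantized_model_id,
    "gpu": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu",
    "torch_version": torch.__version__,
    "transformers_version": transformers.__version__,
    "python_version": platform.python_version(),
}
with open("benchmark_results.json", "w") as f:
    json.dump(run_record, f, indent=2)
```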
By benchmarking systematically, you can select and deploy quantized models with confidence, knowing that they meet your performance requirements while maintaining acceptable quality.