Having examined the metrics and methods for evaluating quantized large language models, it is time to put that knowledge into practice. This section is a hands-on guide to setting up and running a benchmark that compares a quantized LLM against its full-precision counterpart. We will measure performance characteristics such as latency, throughput, and memory usage, and assess the impact on model quality using perplexity. The goal is to obtain concrete data that exposes the trade-offs involved in quantization, so that you can make informed model-deployment decisions.

### Prerequisites and Setup

Before starting, make sure your environment is ready. This exercise assumes access to a CUDA-enabled GPU machine with the required Python libraries installed:

- **Python:** version 3.8 or later.
- **PyTorch:** installed with CUDA support.
- **Transformers:** `pip install transformers`
- **Accelerate:** `pip install accelerate`
- **Datasets:** `pip install datasets`
- **Evaluate:** `pip install evaluate`
- **bitsandbytes** (if benchmarking bitsandbytes-quantized models): `pip install bitsandbytes`
- **AutoGPTQ / AutoAWQ** (if benchmarking GPTQ/AWQ models): `pip install auto-gptq` or `pip install autoawq`

For this example we compare a baseline FP16 model (e.g., `meta-llama/Llama-2-7b-hf`) with a corresponding INT4 quantized version (e.g., one produced with GPTQ). Replace the model identifiers with the specific models you want to evaluate, and make sure the quantized model is compatible with your environment and libraries (GPTQ checkpoints, for example, require `auto-gptq`).

```python
import time

import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Configuration
baseline_model_id = "meta-llama/Llama-2-7b-hf"   # replace with your baseline model
quantized_model_id = "TheBloke/Llama-2-7B-GPTQ"  # replace with your quantized model
device = "cuda" if torch.cuda.is_available() else "cpu"
num_samples = 50       # number of samples for the latency/throughput benchmark
max_new_tokens = 100   # number of tokens to generate
perplexity_dataset = "wikitext"
perplexity_dataset_config = "wikitext-2-raw-v1"
perplexity_split = "test"
perplexity_max_samples = 50  # lower this for a faster evaluation

# Example prompt for the generation tasks
prompt = "The field of Large Language Models is "

print(f"Using device: {device}")
if device == "cpu":
    print("Warning: benchmarking on CPU is significantly slower and has a different memory profile.")

# --- Helper functions ---

def load_model_and_tokenizer(model_id, is_quantized=False):
    print(f"Loading model: {model_id}...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # set a padding token if one is missing

    # FP16 for the baseline; quantized checkpoints keep their own packed weight format.
    model_kwargs = {"device_map": "auto", "torch_dtype": torch.float16}
    if is_quantized:
        # Add loading arguments specific to your quantized model type here.
        # Example for a GPTQ checkpoint:
        # model_kwargs["use_safetensors"] = True
        # model_kwargs["trust_remote_code"] = True  # be cautious with trust_remote_code
        pass
    model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
    model.eval()  # put the model in evaluation mode
    print("Model loaded.")
    return model, tokenizer


def measure_latency(model, tokenizer, prompt, max_new_tokens, num_runs=10):
    print("Measuring latency...")
    latencies = []
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Warm-up run
    with torch.no_grad():
        _ = model.generate(**inputs, max_new_tokens=max_new_tokens,
                           pad_token_id=tokenizer.pad_token_id)
    torch.cuda.synchronize()  # make sure GPU work has finished

    for _ in range(num_runs):
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)
        start_event.record()
        with torch.no_grad():
            _ = model.generate(**inputs, max_new_tokens=max_new_tokens,
                               pad_token_id=tokenizer.pad_token_id)
        end_event.record()
        torch.cuda.synchronize()  # wait for the run to complete
        latency_ms = start_event.elapsed_time(end_event)
        latencies.append(latency_ms)
        # print(f"Single-run latency: {latency_ms:.2f} ms")  # optional per-run output

    avg_latency = np.mean(latencies)
    print(f"Average latency ({num_runs} runs): {avg_latency:.2f} ms")
    return avg_latency

def measure_throughput(model, tokenizer, prompt, max_new_tokens, num_samples=50):
    print("Measuring throughput...")
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Warm-up
    with torch.no_grad():
        _ = model.generate(**inputs, max_new_tokens=max_new_tokens,
                           pad_token_id=tokenizer.pad_token_id)
    torch.cuda.synchronize()

    start_time = time.time()
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    with torch.no_grad():
        # Note: this simple setup does not parallelize requests. Throughput is
        # usually measured with batching or concurrent requests; for simplicity
        # we measure sequential generation speed over one long sequence, capped
        # so that prompt + output stay within the model's context window.
        gen_len = min(num_samples * max_new_tokens,
                      model.config.max_position_embeddings - inputs.input_ids.shape[1])
        outputs = model.generate(**inputs,
                                 max_new_tokens=gen_len,
                                 pad_token_id=tokenizer.pad_token_id,
                                 do_sample=False)
    end_event.record()
    torch.cuda.synchronize()

    # Count generated tokens, excluding the prompt
    total_tokens_generated = outputs[0][inputs.input_ids.shape[1]:].size(0)
    total_time_sec = start_event.elapsed_time(end_event) / 1000.0  # seconds
    # Alternative: CPU timing (less precise for GPU work)
    # total_time_sec = time.time() - start_time

    throughput_tokens_per_sec = total_tokens_generated / total_time_sec if total_time_sec > 0 else 0
    print(f"Total tokens generated: {total_tokens_generated}")
    print(f"Total time: {total_time_sec:.2f} s")
    print(f"Throughput: {throughput_tokens_per_sec:.2f} tokens/sec")
    return throughput_tokens_per_sec


def measure_memory_usage(model_load_fn, *args, **kwargs):
    print("Measuring peak memory usage...")
    torch.cuda.reset_peak_memory_stats(device)

    # Load the model inside this function so its footprint is captured
    model, tokenizer = model_load_fn(*args, **kwargs)
    memory_after_load = torch.cuda.max_memory_allocated(device)
    print(f"Memory after load: {memory_after_load / (1024**3):.2f} GB")

    # Run one small inference to capture runtime memory as well
    inputs = tokenizer("Sample text for memory measurement.", return_tensors="pt").to(device)
    with torch.no_grad():
        _ = model.generate(**inputs, max_new_tokens=10, pad_token_id=tokenizer.pad_token_id)
    torch.cuda.synchronize()

    peak_memory_gb = torch.cuda.max_memory_allocated(device) / (1024**3)  # bytes to GB
    print(f"Peak memory during inference: {peak_memory_gb:.2f} GB")

    # Free the memory again
    del model
    del tokenizer
    torch.cuda.empty_cache()
    return peak_memory_gb


def calculate_perplexity(model, tokenizer, dataset_name, dataset_config, split,
                         max_samples=50, stride=512):
    """Perplexity = exp(mean negative log-likelihood per predicted token).

    Computed directly with the already-loaded model; the `evaluate` perplexity
    metric expects a hub model ID and would reload the model from scratch.
    """
    print(f"Calculating perplexity on {dataset_name} ({split} split)...")
    try:
        data = load_dataset(dataset_name, dataset_config, split=f"{split}[:{max_samples}]")  # slice for speed
        text = "\n\n".join(t for t in data["text"] if t.strip())
        input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)

        nlls, n_tokens = [], 0
        for start in range(0, input_ids.size(1) - 1, stride):
            chunk = input_ids[:, start:start + stride + 1]
            if chunk.size(1) < 2:
                break
            with torch.no_grad():
                # Labels are shifted inside the model; the returned loss is the
                # mean negative log-likelihood per predicted token in the chunk.
                loss = model(chunk, labels=chunk).loss
            num_predicted = chunk.size(1) - 1
            nlls.append(loss.float() * num_predicted)
            n_tokens += num_predicted

        ppl = torch.exp(torch.stack(nlls).sum() / n_tokens).item()
        print(f"Perplexity: {ppl:.4f}")
        return ppl
    except Exception as e:
        print(f"Error while computing perplexity: {e}")
        print("Skipping perplexity calculation.")
        return None


# --- Benchmark execution ---
results = {}

# Benchmark the baseline model
print("\n--- Benchmarking baseline model ---")
# Measure memory separately so peak usage during loading is captured
baseline_memory = measure_memory_usage(load_model_and_tokenizer, baseline_model_id, is_quantized=False)

# Load again for the remaining benchmarks
baseline_model, baseline_tokenizer = load_model_and_tokenizer(baseline_model_id, is_quantized=False)
baseline_latency = measure_latency(baseline_model, baseline_tokenizer, prompt, max_new_tokens)
baseline_throughput = measure_throughput(baseline_model, baseline_tokenizer, prompt, max_new_tokens, num_samples)
baseline_perplexity = calculate_perplexity(baseline_model, baseline_tokenizer,
                                           perplexity_dataset, perplexity_dataset_config,
                                           perplexity_split, perplexity_max_samples)

results["baseline"] = {
    "latency_ms": baseline_latency,
    "throughput_tokens_sec": baseline_throughput,
    "peak_memory_gb": baseline_memory,
    "perplexity": baseline_perplexity,
}

# Free the baseline model before loading the quantized one
print("Cleaning up baseline model...")
del baseline_model
del baseline_tokenizer
torch.cuda.empty_cache()
print("Baseline model cleanup complete.")

# Benchmark the quantized model
print("\n--- Benchmarking quantized model ---")
# Measure memory separately
quantized_memory = measure_memory_usage(load_model_and_tokenizer, quantized_model_id, is_quantized=True)

# Load again for the remaining benchmarks
quantized_model, quantized_tokenizer = load_model_and_tokenizer(quantized_model_id, is_quantized=True)
quantized_latency = measure_latency(quantized_model, quantized_tokenizer, prompt, max_new_tokens)
quantized_throughput = measure_throughput(quantized_model, quantized_tokenizer, prompt, max_new_tokens, num_samples)
quantized_perplexity = calculate_perplexity(quantized_model, quantized_tokenizer,
                                            perplexity_dataset, perplexity_dataset_config,
                                            perplexity_split, perplexity_max_samples)

results["quantized"] = {
    "latency_ms": quantized_latency,
    "throughput_tokens_sec": quantized_throughput,
    "peak_memory_gb": quantized_memory,
    "perplexity": quantized_perplexity,
}

# Free the quantized model
print("Cleaning up quantized model...")
del quantized_model
del quantized_tokenizer
torch.cuda.empty_cache()
print("Quantized model cleanup complete.")

# --- Results analysis ---
print("\n--- Benchmark summary ---")
print(f"{'Metric':<25} {'Baseline':<15} {'Quantized':<15} {'Change (%)':<15}")
print("-" * 70)

# Latency
base_lat = results["baseline"]["latency_ms"]
quant_lat = results["quantized"]["latency_ms"]
lat_change = ((quant_lat - base_lat) / base_lat) * 100 if base_lat else 0
print(f"{'Avg latency (ms)':<25} {base_lat:<15.2f} {quant_lat:<15.2f} {lat_change:<15.2f}")

# Throughput
base_thr = results["baseline"]["throughput_tokens_sec"]
quant_thr = results["quantized"]["throughput_tokens_sec"]
thr_change = ((quant_thr - base_thr) / base_thr) * 100 if base_thr else 0
print(f"{'Throughput (tokens/s)':<25} {base_thr:<15.2f} {quant_thr:<15.2f} {thr_change:<15.2f}")

# Memory
base_mem = results["baseline"]["peak_memory_gb"]
quant_mem = results["quantized"]["peak_memory_gb"]
mem_change = ((quant_mem - base_mem) / base_mem) * 100 if base_mem else 0
print(f"{'Peak memory (GB)':<25} {base_mem:<15.2f} {quant_mem:<15.2f} {mem_change:<15.2f}")

# Perplexity
base_ppl = results["baseline"]["perplexity"]
quant_ppl = results["quantized"]["perplexity"]
if base_ppl is not None and quant_ppl is not None:
    ppl_change = ((quant_ppl - base_ppl) / base_ppl) * 100
    print(f"{'Perplexity':<25} {base_ppl:<15.4f} {quant_ppl:<15.4f} {ppl_change:<15.2f}")
else:
    print(f"{'Perplexity':<25} {'N/A':<15} {'N/A':<15} {'N/A':<15}")
print("-" * 70)

# Optional: visualization
# Prepare plotting data (replace with the actual numbers from your run).
# The fallbacks below are illustrative placeholder values.
base_lat_val = results["baseline"]["latency_ms"] or 1000
quant_lat_val = results["quantized"]["latency_ms"] or 500
base_thr_val = results["baseline"]["throughput_tokens_sec"] or 50
quant_thr_val = results["quantized"]["throughput_tokens_sec"] or 100
base_mem_val = results["baseline"]["peak_memory_gb"] or 15
quant_mem_val = results["quantized"]["peak_memory_gb"] or 8
base_ppl_val = results["baseline"]["perplexity"] or 5.0
quant_ppl_val = results["quantized"]["perplexity"] or 5.5
```

```plotly
{"layout": {"title": "Baseline (FP16) vs. Quantized (INT4) LLM Performance", "barmode": "group", "xaxis": {"title": "Metric"}, "yaxis": {"title": "Value"}, "legend_title_text": "Model type", "height": 400}, "data": [{"type": "bar", "name": "Baseline (FP16)", "x": ["Latency (ms)", "Throughput (tok/s)", "Memory (GB)", "Perplexity"], "y": [base_lat_val, base_thr_val, base_mem_val, base_ppl_val], "marker": {"color": "#4263eb"}}, {"type": "bar", "name": "Quantized (INT4)", "x": ["Latency (ms)", "Throughput (tok/s)", "Memory (GB)", "Perplexity"], "y": [quant_lat_val, quant_thr_val, quant_mem_val, quant_ppl_val], "marker": {"color": "#12b886"}}]}
```

Comparison of the baseline FP16 model and its INT4 quantized version on performance and quality metrics. Lower is better for latency, memory, and perplexity; higher is better for throughput. (Note: the values shown are illustrative.)
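In addition to the runtime metrics summarized above, it is worth recording the on-disk size of the two checkpoints, since quantization shrinks the stored weights as well. The sketch below is a minimal Python stand-in for `du -sh`; the directory paths are placeholders for wherever your local copies of the checkpoints live.

```python
import os


def directory_size_gb(path):
    """Total size of all files under `path`, in GB (a rough equivalent of `du -sh`)."""
    total = 0
    for root, _, files in os.walk(path):
        for name in files:
            total += os.path.getsize(os.path.join(root, name))
    return total / (1024 ** 3)


# Placeholder paths: point these at the local checkpoint directories
# (for example, a local export or the Hugging Face cache entry for each model).
# print(f"Baseline on disk:  {directory_size_gb('/path/to/baseline-model'):.2f} GB")
# print(f"Quantized on disk: {directory_size_gb('/path/to/quantized-model'):.2f} GB")
```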
### Interpreting the Results

The output table and chart give you a quantitative comparison. You should observe the following:

- **Latency:** The quantized model typically shows lower latency (faster single-request inference) because of the reduced computation and memory-access time.
- **Throughput:** The quantized model is expected to deliver higher throughput, meaning it can process more tokens or requests per second. The measurement here is deliberately basic (tokens per second during a single long generation); a more sophisticated setup would use batching or concurrent requests for a more realistic server-style number (see the sketch at the end of this section).
- **Memory usage:** Peak GPU memory usage of the quantized model should be significantly lower. This covers both the memory needed to load the model weights and the runtime memory needed for activations. You can also compare the on-disk model size (`du -sh model_directory`, or the Python sketch above).
- **Perplexity:** Perplexity usually increases slightly with quantization, indicating a small drop in the model's ability to predict the next token under its training distribution. How much of an increase is acceptable depends heavily on the application and the quantization method; a small increase (for example, less than 1 point) may be a fair trade for the substantial performance gains.

This hands-on benchmark provides essential data points. Keep in mind that the results are specific to the hardware used, the chosen models, the quantization technique (e.g., GPTQ, AWQ, bitsandbytes), and the benchmark configuration (batch size, sequence length, dataset). For production use, you could extend this work by:

- benchmarking on the target deployment hardware;
- evaluating downstream tasks specific to your application (e.g., ROUGE scores for summarization, accuracy for question answering);
- using more sophisticated throughput measurement tools or simulating production load (a minimal batched-generation sketch follows below);
- comparing different quantization methods and bit widths.

By benchmarking systematically, you can confidently select and deploy quantized models that meet your performance requirements while maintaining acceptable quality.
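As a starting point for the more realistic throughput measurement mentioned above, here is a minimal sketch of batched-generation throughput. It assumes a model and tokenizer loaded with the `load_model_and_tokenizer` helper from the script; the batch of identical prompts is only a stand-in for real concurrent requests, and the token count is approximate because sequences that stop early are padded to the longest one in the batch.

```python
import time

import torch


def measure_batched_throughput(model, tokenizer, prompts, max_new_tokens=100):
    # Decoder-only models should be left-padded for batched generation.
    tokenizer.padding_side = "left"
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)

    # Warm-up
    with torch.no_grad():
        _ = model.generate(**inputs, max_new_tokens=8, pad_token_id=tokenizer.pad_token_id)
    torch.cuda.synchronize()

    start = time.time()
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens,
                                 pad_token_id=tokenizer.pad_token_id, do_sample=False)
    torch.cuda.synchronize()
    elapsed = time.time() - start

    # Tokens generated beyond the (padded) prompt, summed over the batch.
    new_tokens = (outputs.shape[1] - inputs.input_ids.shape[1]) * outputs.shape[0]
    return new_tokens / elapsed


# Example usage, e.g. after loading the quantized model:
# model, tokenizer = load_model_and_tokenizer(quantized_model_id, is_quantized=True)
# batch_prompts = [prompt] * 8  # eight copies of the example prompt as a small batch
# print(f"Batched throughput: {measure_batched_throughput(model, tokenizer, batch_prompts):.2f} tokens/sec")
```

For genuine production load, a serving stack with continuous batching will give more representative numbers than any single-process `generate()` loop, but this sketch is enough to see how per-request cost amortizes as the batch grows.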