The projected gradient descent (PGD) algorithm offers a practical approach to constrained optimization. It targets problems whose solution must lie in a prescribed feasible set $\mathcal{C}$, the setting typically analyzed through Lagrangian duality and the KKT conditions. PGD is an intuitive extension of standard gradient descent that handles such constraints effectively. The core idea is simple: take a standard gradient descent step, which may carry you outside the feasible set, then project the resulting point back onto $\mathcal{C}$. This guarantees that every iterate $x_k$ remains feasible.

### Algorithm Review

The PGD update rule consists of two steps.

**Gradient step:** compute a candidate next point using the standard gradient update:

$$ y_{k+1} = x_k - \alpha_k \nabla f(x_k) $$

where $x_k$ is the current feasible point, $\nabla f(x_k)$ is the gradient of the objective $f$ at $x_k$, and $\alpha_k$ is the learning rate at iteration $k$.

**Projection step:** project the intermediate point $y_{k+1}$ back onto the feasible set $\mathcal{C}$:

$$ x_{k+1} = \Pi_{\mathcal{C}}(y_{k+1}) $$

Here, $\Pi_{\mathcal{C}}(y)$ denotes the projection operator, which returns the point in $\mathcal{C}$ closest to $y$ in Euclidean distance.

The practicality of PGD usually hinges on how efficiently the projection $\Pi_{\mathcal{C}}$ can be computed. Fortunately, for many constraint sets common in machine learning (e.g., box constraints, norm balls, or the probability simplex), the projection can be computed analytically and efficiently.
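As a concrete illustration of such a closed-form projection, here is a minimal sketch (an illustrative helper, not part of the example that follows) of the Euclidean projection onto the ball $\{x : \|x\|_2 \le r\}$: a point already inside the ball is its own projection, while a point outside is rescaled onto the boundary.

```python
import numpy as np

def project_onto_l2_ball(y, radius=1.0):
    """Euclidean projection onto the ball {x : ||x||_2 <= radius}."""
    norm = np.linalg.norm(y)
    if norm <= radius:
        return y                  # already feasible: y is its own projection
    return (radius / norm) * y    # rescale onto the boundary of the ball
```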
### Example: Minimizing a Quadratic Function with Box Constraints

Consider minimizing a simple quadratic function subject to box constraints, which keep each variable $x_i$ between a lower bound $l_i$ and an upper bound $u_i$.

**Problem:** minimize $f(x_1, x_2) = (x_1 - 5)^2 + (x_2 - 4)^2$

**Subject to:**

- $0 \le x_1 \le 3$
- $0 \le x_2 \le 2$

The unconstrained minimum clearly sits at $(5, 4)$. However, this point lies outside our feasible region, the box $\mathcal{C} = [0, 3] \times [0, 2]$, so we expect PGD to converge to a point on the boundary of the box. The gradient is $\nabla f(x) = [2(x_1 - 5), 2(x_2 - 4)]$.

**Projection operator for box constraints:** for a box defined by lower bounds $l = [l_1, ..., l_n]$ and upper bounds $u = [u_1, ..., u_n]$, the projection $\Pi_{\mathcal{C}}(y)$ is computed element-wise:

$$ (\Pi_{\mathcal{C}}(y))_i = \max(l_i, \min(y_i, u_i)) $$

This simply clips each component of $y$ into its interval $[l_i, u_i]$.

### Implementation with NumPy

Let's implement PGD in Python using NumPy (Plotly and json are imported here for the visualization step later on).

```python
import numpy as np
import plotly.graph_objects as go
import json  # used later to serialize the figure for embedding

# 1. Define the objective function
def objective_function(x):
    """Objective: f(x1, x2) = (x1 - 5)^2 + (x2 - 4)^2"""
    return (x[0] - 5)**2 + (x[1] - 4)**2

# 2. Define the gradient
def gradient(x):
    """Gradient: nabla f(x) = [2(x1 - 5), 2(x2 - 4)]"""
    return np.array([2 * (x[0] - 5), 2 * (x[1] - 4)])

# 3. Define the projection operator for box constraints
def project_onto_box(y, lower_bounds, upper_bounds):
    """Project the point y onto the box defined by the given bounds."""
    return np.maximum(lower_bounds, np.minimum(y, upper_bounds))

# 4. Implement projected gradient descent
def projected_gradient_descent(
    start_point,
    gradient_func,
    project_func,
    lower_bounds,
    upper_bounds,
    learning_rate=0.1,
    max_iterations=100,
    tolerance=1e-6
):
    """Run projected gradient descent."""
    x = np.array(start_point, dtype=float)

    # Make sure the starting point is feasible
    x = project_func(x, lower_bounds, upper_bounds)
    history = [np.copy(x)]  # store the iterate history
    print(f"PGD starting from: {x}")

    for i in range(max_iterations):
        grad = gradient_func(x)

        # Gradient step
        y_next = x - learning_rate * grad

        # Projection step
        x_next = project_func(y_next, lower_bounds, upper_bounds)

        # Convergence check (change in the iterate)
        step_size = np.linalg.norm(x_next - x)
        x = x_next
        history.append(np.copy(x))  # store the feasible iterate
        if step_size < tolerance:
            print(f"Converged after {i + 1} iterations. Step size: {step_size:.2e}")
            break
    else:  # the loop ran to completion without breaking
        print(f"Reached the maximum of {max_iterations} iterations.")

    return x, np.array(history)

# --- Problem setup ---
initial_x = np.array([0.0, 0.0])  # start from a feasible corner
bounds_lower = np.array([0.0, 0.0])
bounds_upper = np.array([3.0, 2.0])
learn_rate = 0.1
iterations = 50

# --- Run the algorithm ---
optimal_x, trajectory = projected_gradient_descent(
    initial_x,
    gradient,
    project_onto_box,
    bounds_lower,
    bounds_upper,
    learning_rate=learn_rate,
    max_iterations=iterations
)

print(f"Constrained optimum found at: {optimal_x}")
print(f"Objective value at the optimum: {objective_function(optimal_x):.4f}")
```
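A useful sanity check comes from PGD's fixed-point characterization: for a convex objective over a convex set, $x^*$ is a constrained minimizer exactly when a projected gradient step leaves it unchanged, i.e. $x^* = \Pi_{\mathcal{C}}(x^* - \alpha \nabla f(x^*))$ for any $\alpha > 0$. A quick numerical check, reusing the functions defined above:

```python
# Fixed-point optimality check: at the constrained minimizer, one more
# projected gradient step should return the same point.
fixed_point = project_onto_box(
    optimal_x - learn_rate * gradient(optimal_x),
    bounds_lower,
    bounds_upper,
)
residual = np.linalg.norm(optimal_x - fixed_point)
print(f"Fixed-point residual: {residual:.2e}")  # expect ~0 at (3, 2)
```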
### Visualizing the Result

The following code plots the objective's contours, the feasible box, and the PGD trajectory with Plotly.

```python
# --- Visualization setup ---
# Generate a grid for the contour plot
x1_vals = np.linspace(-1, 6, 50)
x2_vals = np.linspace(-1, 5.5, 50)
X1, X2 = np.meshgrid(x1_vals, x2_vals)
Z = objective_function([X1, X2])

# Create the contour plot
fig = go.Figure(data=go.Contour(
    z=Z,
    x=x1_vals,
    y=x2_vals,
    colorscale='Blues',
    contours=dict(coloring='lines', start=0, end=50, size=2.5),
    line_width=1
))

# Add the feasible region (the box constraint) as a rectangle
fig.add_shape(
    type="rect",
    x0=bounds_lower[0], y0=bounds_lower[1],
    x1=bounds_upper[0], y1=bounds_upper[1],
    line=dict(color="#f03e3e", width=2),  # red border
    fillcolor="rgba(250, 82, 82, 0.1)",   # light red fill
    layer='below'
)

# Add the optimization trajectory
fig.add_trace(go.Scatter(
    x=trajectory[:, 0], y=trajectory[:, 1],
    mode='lines+markers', name='PGD path',
    line=dict(color='#1c7ed6', width=2),  # blue line
    marker=dict(color='#4263eb', size=5, symbol='circle-open')
))

# Highlight the start and end points
fig.add_trace(go.Scatter(
    x=[trajectory[0, 0]], y=[trajectory[0, 1]],
    mode='markers', name='Start',
    marker=dict(color='#37b24d', size=10, symbol='circle')  # green start
))
fig.add_trace(go.Scatter(
    x=[trajectory[-1, 0]], y=[trajectory[-1, 1]],
    mode='markers', name='Constrained optimum',
    marker=dict(color='#f76707', size=12, symbol='star')  # orange star
))

# Add the unconstrained optimum for reference
fig.add_trace(go.Scatter(
    x=[5], y=[4],
    mode='markers', name='Unconstrained optimum',
    marker=dict(color='#495057', size=10, symbol='x-thin', line=dict(width=2))  # gray cross
))

# Update the layout for clarity
fig.update_layout(
    title_text='Projected Gradient Descent Optimization Path',
    xaxis_title='Parameter x1',
    yaxis_title='Parameter x2',
    width=700, height=600,
    xaxis=dict(range=[-1, 6], scaleanchor='y', scaleratio=1),  # keep the aspect ratio
    yaxis=dict(range=[-1, 5.5]),
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01,
                bgcolor='rgba(255,255,255,0.7)')
)

# Serialize the figure as single-line JSON for embedding
plotly_json_string = json.dumps(fig.to_dict())
```

*Figure: Projected gradient descent minimizing $f(x_1, x_2) = (x_1 - 5)^2 + (x_2 - 4)^2$ subject to $0 \le x_1 \le 3$ and $0 \le x_2 \le 2$. The path starts at $(0, 0)$ and moves toward the unconstrained minimum $(5, 4)$, but is repeatedly projected back into the feasible region (the red box). It converges to the corner $(3, 2)$, the feasible point closest to the unconstrained minimum.*

### Discussion

As the visualization shows, the PGD iterates move in the direction of the negative gradient but are forced back into the box whenever a step would leave it. The algorithm finds the constrained minimum at $(3, 2)$, the point in the box closest to the unconstrained minimum $(5, 4)$, exactly as expected for this quadratic objective.

Key points about this implementation:

- **Feasibility:** every point in `trajectory` (after the initial projection of the starting point) lies within the feasible box.
- **Projection efficiency:** `project_onto_box` is cheap to compute, involving only element-wise `min` and `max` operations. The efficiency of this projection step strongly influences the overall performance of PGD.
- **Learning rate:** as in standard gradient descent, the choice of learning rate $\alpha$ matters. Too large a rate can cause oscillation or divergence, while too small a rate slows convergence. Line searches and adaptive learning rates can also be adapted to PGD, though some care is needed around the projection step.
- **Convergence:** for a convex objective and a convex feasible set (as in our example), PGD is guaranteed to converge to the optimum with a suitable learning rate (e.g., a sufficiently small constant rate, or a diminishing step-size schedule). The analysis typically relies on properties such as Lipschitz continuity of the gradient and the non-expansiveness of projection onto a convex set.

This hands-on example illustrates the core mechanics of projected gradient descent. Simple as it is, it lays the groundwork for more complex constrained problems that arise in regularized regression (e.g., LASSO with non-negativity constraints), optimal control, and training models with specific parameter requirements. The main challenge is usually designing or finding an efficient projection operator for the particular constraint set $\mathcal{C}$ of interest, as the sketch below illustrates.
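The probability simplex is a good example of a set whose projection is efficient but not element-wise. Below is a minimal sketch of the standard sort-based $O(n \log n)$ method for the Euclidean projection onto $\{x : x_i \ge 0,\ \sum_i x_i = 1\}$; it is illustrative only and not part of the example above.

```python
import numpy as np

def project_onto_simplex(y):
    """Euclidean projection onto the probability simplex
    {x : x_i >= 0, sum_i x_i = 1}, via the standard sort-based method.
    """
    n = len(y)
    u = np.sort(y)[::-1]   # sort components in decreasing order
    cumsum = np.cumsum(u)
    # Largest 0-based index j with u_j + (1 - cumsum_j) / (j + 1) > 0
    rho = np.nonzero(u + (1.0 - cumsum) / np.arange(1, n + 1) > 0)[0][-1]
    theta = (1.0 - cumsum[rho]) / (rho + 1)  # shift that enforces sum = 1
    return np.maximum(y + theta, 0.0)

print(project_onto_simplex(np.array([1.2, 0.3, -0.5])))  # -> [0.95 0.05 0.  ]
```

With a small adapter such as `lambda y, *_: project_onto_simplex(y)` in place of `project_onto_box`, the same `projected_gradient_descent` loop handles simplex-constrained problems.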