Now that we've discussed the concept of internal covariate shift and how Batch Normalization (BN) aims to mitigate it, let's put theory into practice. In this hands-on section, we'll build and train two simple neural networks: one without BN and one with BN integrated. Our goal is to observe firsthand the effects of BN on training stability and convergence speed.
We'll use PyTorch for this exercise. Ensure you have PyTorch installed. We'll also use a simple synthetic dataset to keep the focus squarely on the impact of the normalization technique itself.
First, let's import necessary libraries and generate some synthetic classification data.
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # Or use Plotly for interactive plots
# Generate synthetic data
X, y = make_classification(n_samples=1000, n_features=20, n_informative=10,
                           n_redundant=5, n_classes=2, random_state=42)
# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Convert to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32).unsqueeze(1) # Target needs shape (N, 1) for BCEWithLogitsLoss
# Split data
X_train, X_val, y_train, y_val = train_test_split(X_tensor, y_tensor, test_size=0.2, random_state=42)
# Create DataLoaders (optional but good practice)
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
val_dataset = torch.utils.data.TensorDataset(X_val, y_val)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=False)
print(f"Training samples: {len(train_loader.dataset)}")
print(f"Validation samples: {len(val_loader.dataset)}")
Let's define a basic Multi-Layer Perceptron (MLP) with two hidden layers.
class SimpleMLP(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(SimpleMLP, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size1)
        self.relu1 = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        self.output_layer = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu1(x)
        x = self.layer2(x)
        x = self.relu2(x)
        x = self.output_layer(x)
        return x
# Instantiate the model
input_dim = X_train.shape[1]
hidden_dim1 = 128
hidden_dim2 = 64
output_dim = 1 # Binary classification
model_no_bn = SimpleMLP(input_dim, hidden_dim1, hidden_dim2, output_dim)
print("Model without Batch Normalization:")
print(model_no_bn)
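Before training, it can help to confirm the model produces logits of the expected shape. The following is a small sketch that passes one batch through model_no_bn; the shape shown assumes the batch size of 64 used earlier.

# Quick forward-pass check (sketch): logits should have shape (batch_size, 1)
sample_inputs, _ = next(iter(train_loader))
with torch.no_grad():
    sample_logits = model_no_bn(sample_inputs)
print(sample_logits.shape)  # expected: torch.Size([64, 1])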
Now, let's create a similar MLP, but add BatchNorm1d layers after each linear transformation, just before the activation function. This is a common placement strategy.
class MLPWithBN(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(MLPWithBN, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size1)
        self.bn1 = nn.BatchNorm1d(hidden_size1)  # BN layer for layer1 output
        self.relu1 = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size1, hidden_size2)
        self.bn2 = nn.BatchNorm1d(hidden_size2)  # BN layer for layer2 output
        self.relu2 = nn.ReLU()
        self.output_layer = nn.Linear(hidden_size2, output_size)

    def forward(self, x):
        x = self.layer1(x)
        x = self.bn1(x)  # Apply BN before activation
        x = self.relu1(x)
        x = self.layer2(x)
        x = self.bn2(x)  # Apply BN before activation
        x = self.relu2(x)
        x = self.output_layer(x)
        return x
# Instantiate the model
model_with_bn = MLPWithBN(input_dim, hidden_dim1, hidden_dim2, output_dim)
print("\nModel with Batch Normalization:")
print(model_with_bn)
Notice the nn.BatchNorm1d layers that have been added. The 1d signifies that we expect inputs of shape (Batch Size, Features), which is typical for fully connected layers processing non-spatial data.
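To see what nn.BatchNorm1d does on its own, you can apply a standalone layer to a random batch. This small sketch (with arbitrary values) shows that, in training mode, each feature is normalized to approximately zero mean and unit variance using the batch statistics, before the learnable scale (γ) and shift (β), which are initialized to 1 and 0, have any effect.

# Standalone BatchNorm1d demo (sketch): normalize a random (batch, features) tensor
bn = nn.BatchNorm1d(num_features=5)
dummy = torch.randn(8, 5) * 3 + 2             # batch of 8 samples, 5 features, shifted and scaled
normalized = bn(dummy)                        # training mode by default: uses batch statistics
print(normalized.mean(dim=0))                 # per-feature means, approximately 0
print(normalized.std(dim=0, unbiased=False))  # per-feature stds, approximately 1
print(bn.running_mean)                        # running estimates updated after the forward pass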
We'll define a standard training loop function. Pay close attention to the use of model.train() and model.eval(). This is particularly important for models with BN (and Dropout), as these layers behave differently during training and evaluation.

- model.train(): Puts the model in training mode. BN layers use the current mini-batch statistics (μ, σ²) for normalization and update their running estimates of the population statistics.
- model.eval(): Puts the model in evaluation mode. BN layers use the previously learned running estimates for normalization and do not update them.

def train_model(model, train_loader, val_loader, epochs=20, lr=0.01):
    criterion = nn.BCEWithLogitsLoss()  # Combines Sigmoid and Binary Cross Entropy
    optimizer = optim.Adam(model.parameters(), lr=lr)
    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        model.train()  # Set model to training mode
        running_train_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_train_loss += loss.item() * inputs.size(0)
        epoch_train_loss = running_train_loss / len(train_loader.dataset)
        train_losses.append(epoch_train_loss)

        model.eval()  # Set model to evaluation mode
        running_val_loss = 0.0
        with torch.no_grad():  # Disable gradient calculations for validation
            for inputs, labels in val_loader:
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                running_val_loss += loss.item() * inputs.size(0)
        epoch_val_loss = running_val_loss / len(val_loader.dataset)
        val_losses.append(epoch_val_loss)

        if (epoch + 1) % 5 == 0 or epoch == 0:
            print(f"Epoch {epoch+1}/{epochs} - Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}")

    return train_losses, val_losses
# --- Train the models ---
print("\nTraining Model without Batch Normalization...")
# Re-initialize model to ensure fair comparison
model_no_bn = SimpleMLP(input_dim, hidden_dim1, hidden_dim2, output_dim)
train_losses_no_bn, val_losses_no_bn = train_model(model_no_bn, train_loader, val_loader, epochs=25, lr=0.01)
print("\nTraining Model with Batch Normalization...")
# Re-initialize model
model_with_bn = MLPWithBN(input_dim, hidden_dim1, hidden_dim2, output_dim)
train_losses_bn, val_losses_bn = train_model(model_with_bn, train_loader, val_loader, epochs=25, lr=0.01)
Now, let's visualize the training and validation loss curves for both models.
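The original figure isn't reproduced here, but a minimal matplotlib sketch such as the following produces comparable curves from the loss lists returned by train_model:

# Plot training and validation loss curves for both models
epochs_range = range(1, len(train_losses_no_bn) + 1)
plt.figure(figsize=(10, 5))
plt.plot(epochs_range, train_losses_no_bn, label="Train (no BN)")
plt.plot(epochs_range, val_losses_no_bn, label="Val (no BN)", linestyle="--")
plt.plot(epochs_range, train_losses_bn, label="Train (with BN)")
plt.plot(epochs_range, val_losses_bn, label="Val (with BN)", linestyle="--")
plt.xlabel("Epoch")
plt.ylabel("BCE Loss")
plt.title("Loss Curves: With vs. Without Batch Normalization")
plt.legend()
plt.show()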
Comparison of training and validation loss curves over 25 epochs for models with and without Batch Normalization. Note: Actual results may vary slightly due to random initialization and data shuffling.
Observing the plot (based on typical expected results), the model with Batch Normalization generally reaches a lower training and validation loss in fewer epochs, and its loss curves tend to be smoother than those of the model without BN.

Consider trying the following:

- Increase the learning rate (e.g., lr=0.1). Observe whether the non-BN model struggles to converge or becomes unstable, while the BN model might handle it better. A sketch of this experiment follows the list.
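A minimal sketch of that learning-rate experiment, reusing the classes, hyperparameters, and train_model function defined above (only the learning rate changes):

# Retrain both architectures with a more aggressive learning rate (sketch)
model_no_bn_hi_lr = SimpleMLP(input_dim, hidden_dim1, hidden_dim2, output_dim)
model_with_bn_hi_lr = MLPWithBN(input_dim, hidden_dim1, hidden_dim2, output_dim)

train_hi_no_bn, val_hi_no_bn = train_model(model_no_bn_hi_lr, train_loader, val_loader, epochs=25, lr=0.1)
train_hi_bn, val_hi_bn = train_model(model_with_bn_hi_lr, train_loader, val_loader, epochs=25, lr=0.1)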
This practical exercise demonstrates how integrating BatchNorm1d layers can lead to faster, more stable training for feedforward networks. It addresses the internal covariate shift problem by normalizing activations, making the optimization process smoother and often allowing for more aggressive learning rates. Remember the importance of model.train() and model.eval() to ensure BN behaves correctly during different phases.
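As a final check, you might evaluate classification accuracy on the validation set. This is a sketch using the trained model_with_bn from above; calling model.eval() ensures the BN layers use their running statistics rather than the current batch statistics.

# Compute validation accuracy with BN layers in evaluation mode (sketch)
model_with_bn.eval()
correct = 0
with torch.no_grad():
    for inputs, labels in val_loader:
        logits = model_with_bn(inputs)
        preds = (torch.sigmoid(logits) >= 0.5).float()  # threshold probabilities at 0.5
        correct += (preds == labels).sum().item()
print(f"Validation accuracy: {correct / len(val_loader.dataset):.3f}")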