1. What Is PyTorch?
PyTorch provides a flexible and intuitive platform for building and training deep learning models with define-by-run execution.
Key Features
- Dynamic Computation Graphs: Define-by-run for flexible model architectures.
- Pythonic API: Natural Python code with minimal abstraction.
- Eager Execution: Operations run immediately for easier debugging.
- Strong GPU Acceleration: First-class CUDA support.
- Research-Friendly: Popular in academia and rapid experimentation.
- TorchScript: Compile models for production deployment.
2. How PyTorch Is Used
- Computer vision: detection, segmentation, style transfer
- Natural language processing: transformers and translation
- Reinforcement learning: policy networks and actor-critic methods
- Generative models: GANs, VAEs, diffusion models
- Research prototyping and rapid iteration
- Production deployment with TorchServe and TorchScript
3. Main Components for Neural Networks
3.1 Tensors
Tensors are the core data structure, similar to NumPy arrays but with GPU support and automatic differentiation.
import torch
import numpy as np
# Creating tensors of increasing rank: 0-D scalar, 1-D vector, 2-D matrix, 3-D
scalar = torch.tensor(3.0)
vector = torch.tensor([1.0, 2.0, 3.0])
matrix = torch.tensor([[1, 2], [3, 4]])
tensor_3d = torch.tensor([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
print(f"Scalar shape: {scalar.shape}")
print(f"Vector shape: {vector.shape}")
print(f"Matrix shape: {matrix.shape}")

# Create tensors with specific properties
zeros = torch.zeros(3, 4)
ones = torch.ones(2, 3)
rand = torch.rand(2, 2)  # Uniform [0, 1)
randn = torch.randn(2, 2)  # Normal distribution

# Move tensors to GPU; skipped entirely on machines without CUDA
if torch.cuda.is_available():
    device = torch.device('cuda')
    tensor_gpu = vector.to(device)
    print(f"Tensor on GPU: {tensor_gpu.device}")
3.2 Autograd (Automatic Differentiation)
# Leaf tensors that autograd should differentiate with respect to
x, y = (torch.tensor(value, requires_grad=True) for value in (2.0, 3.0))
# Forward pass: z = x^2 + y^3
z = x.pow(2) + y.pow(3)
# Backward pass: fills in .grad on x and y
z.backward()
print(f"dz/dx: {x.grad}")  # dz/dx = 2*x = 4.0
print(f"dz/dy: {y.grad}")  # dz/dy = 3*y^2 = 27.0
3.3 nn.Module
nn.Module is the base class for neural network modules.
import torch.nn as nn
class SimpleNet(nn.Module):
    """Two-layer fully connected network: 784 inputs -> 128 hidden -> 10 outputs."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(784, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return 10 output scores per sample for a batch `x` of shape (N, 784)."""
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x


model = SimpleNet()
print(model)
4. Essential Methods for Building Neural Networks
Sequential Container
Use nn.Sequential to define a simple chain of layers.
import torch.nn as nn
# OrderedDict comes from the standard library; torch.nn does not export it
from collections import OrderedDict

# Using Sequential: layers are applied in the order they are listed
model = nn.Sequential(
    nn.Linear(784, 128),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(64, 10),
    # Outputs probabilities. Drop this layer when training with
    # nn.CrossEntropyLoss, which expects raw logits.
    nn.Softmax(dim=1),
)

# Named layers for better debugging (reachable as model.fc1, model.relu1, ...)
model = nn.Sequential(
    OrderedDict([
        ('fc1', nn.Linear(784, 128)),
        ('relu1', nn.ReLU()),
        ('dropout1', nn.Dropout(0.2)),
        ('fc2', nn.Linear(128, 10)),
    ])
)
Custom nn.Module (Recommended)
class CustomNet(nn.Module):
    """Classifier with one hidden layer: Linear -> BatchNorm -> ReLU -> Dropout -> Linear.

    Args:
        input_size: Number of features per input sample.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output scores per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return raw scores of shape (N, num_classes) for input of shape (N, input_size)."""
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x


model = CustomNet(input_size=784, hidden_size=128, num_classes=10)
5. Common Layer Types
Linear Layers (Fully Connected)
# Fully connected layer computing y = xW^T + b, mapping 128 features to 64.
# bias=True keeps the learnable offset b.
layer = nn.Linear(128, 64, bias=True)
Convolutional Layers (for Image Data)
class ConvNet(nn.Module):
    """Small CNN for single-channel 28x28 images with 10 output scores.

    Two conv + ReLU + 2x2 max-pool stages feed two fully connected layers.
    The convolutions keep the spatial size (3x3 kernel, padding 1) and each
    pool halves it, so a 28x28 input reaches the classifier as 64 x 7 x 7.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return scores of shape (N, 10) for input of shape (N, 1, 28, 28)."""
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        # Flatten per sample and keep the batch dimension fixed. A wrongly
        # sized input then fails in fc1 instead of being silently regrouped
        # into a different batch size.
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x
Recurrent Layers (for Sequential Data)
class RNNNet(nn.Module):
    """LSTM sequence classifier that scores the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output scores per sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # LSTM dropout acts only between stacked layers. With a single
            # layer it has no effect and PyTorch warns, so disable it there.
            dropout=0.2 if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return scores (N, num_classes) for input (N, seq_len, input_size)."""
        out, (hn, cn) = self.lstm(x)
        # batch_first=True makes `out` (N, seq_len, hidden_size); classify on
        # the output of the final time step.
        out = self.fc(out[:, -1, :])
        return out
# GRU alternative
class GRUNet(nn.Module):
    """Single-layer GRU sequence classifier that scores the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the GRU hidden state.
        num_classes: Number of output scores per sequence.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.gru = nn.GRU(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return scores (N, num_classes) for input (N, seq_len, input_size)."""
        out, hn = self.gru(x)
        # Classify on the output of the final time step.
        out = self.fc(out[:, -1, :])
        return out
6. Complete Neural Network Example
End-to-end training workflow using MNIST.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
# Set device: use CUDA when it is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Define transforms: convert to tensor, then standardize
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.1307,), std=(0.3081,)),  # MNIST mean and std
])

# Load MNIST dataset (downloaded into ./data on first use); both splits share
# the same preprocessing
train_dataset = datasets.MNIST('./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST('./data', train=False, download=True, transform=transform)

# Create data loaders: shuffle only the training split
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=128, shuffle=False)
# Define the model
class NeuralNet(nn.Module):
    """Fully connected classifier for 28x28 inputs with 10 output scores.

    Layout: Flatten -> [Linear -> BatchNorm -> ReLU -> Dropout] x 2 -> Linear,
    with hidden widths 128 and 64.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        """Return raw scores of shape (N, 10); `x` is flattened to (N, 784) first."""
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        return x
# Initialize model, loss, and optimizer
model = NeuralNet()
model.to(device)  # moves the module's parameters and buffers in place
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
# Training function
def train(model, device, train_loader, optimizer, criterion, epoch):
    """Run one training epoch, printing progress and an end-of-epoch summary.

    Args:
        model: Module to optimize; switched to training mode.
        device: Device each batch is moved to before the forward pass.
        train_loader: Iterable of (data, target) batches that supports
            ``len()`` and exposes the underlying ``.dataset``.
        optimizer: Optimizer holding the model's parameters.
        criterion: Callable ``criterion(output, target)`` returning the loss.
        epoch: Epoch number, used only in the progress messages.

    Returns:
        Training accuracy for the epoch, in percent of ``train_loader.dataset``.
    """
    model.train()
    train_loss = 0
    correct = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)

        # Gradients accumulate across backward() calls, so reset them first.
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # .item() yields a plain Python number, so the running total does not
        # keep each batch's autograd graph alive.
        train_loss += loss.item()
        pred = output.argmax(dim=1)
        correct += (pred == target).sum().item()

        if batch_idx % 100 == 0:
            print(
                f"Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)}] "
                f"Loss: {loss.item():.4f}"
            )

    # Mean of the per-batch losses.
    train_loss /= len(train_loader)
    accuracy = 100. * correct / len(train_loader.dataset)
    print(
        f"Train set: Average loss: {train_loss:.4f}, "
        f"Accuracy: {correct}/{len(train_loader.dataset)} ({accuracy:.2f}%)"
    )
    return accuracy
# Testing function
def test(model, device, test_loader, criterion):
    """Evaluate the model on a data loader and print loss and accuracy.

    Args:
        model: Module to evaluate; switched to evaluation mode.
        device: Device each batch is moved to before the forward pass.
        test_loader: Iterable of (data, target) batches that supports
            ``len()`` and exposes the underlying ``.dataset``.
        criterion: Callable ``criterion(output, target)`` returning the loss.

    Returns:
        Accuracy in percent of ``test_loader.dataset``.
    """
    model.eval()
    test_loss = 0
    correct = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()
            pred = output.argmax(dim=1)
            correct += (pred == target).sum().item()

    # Mean of the per-batch losses.
    test_loss /= len(test_loader)
    accuracy = 100. * correct / len(test_loader.dataset)
    print(
        f"Test set: Average loss: {test_loss:.4f}, "
        f"Accuracy: {correct}/{len(test_loader.dataset)} ({accuracy:.2f}%)\n"
    )
    return accuracy


# The name matches pytest's default `test*` pattern. Opt out of collection so
# importing this helper into a test module does not turn it into a test case.
test.__test__ = False
# Train the model, evaluating on the test set after every epoch
num_epochs = 10
for epoch in range(1, num_epochs + 1):
    train(model, device, train_loader, optimizer, criterion, epoch)
    test(model, device, test_loader, criterion)

# Save the model (parameters and buffers only, not the Python class)
torch.save(model.state_dict(), 'mnist_model.pth')
print("Model saved to mnist_model.pth")

# Load the model
loaded_model = NeuralNet().to(device)
# map_location remaps the saved tensors onto the current device, so weights
# saved from a GPU run still load on a CPU-only machine.
loaded_model.load_state_dict(torch.load('mnist_model.pth', map_location=device))
loaded_model.eval()  # inference mode for Dropout and BatchNorm
print("Model loaded successfully")
7. Key Methods Reference
| Method/Class | Description |
| nn.Module | Base class for neural network modules. |
| nn.Sequential() | Container that applies its layers in order. |
| super().__init__() | Initialize parent class in custom modules. |
| forward() | Define the forward pass computation. |
| Layer | Description |
| nn.Linear(in, out) | Fully connected layer. |
| nn.Conv2d(in, out, kernel) | 2D convolutional layer. |
| nn.LSTM(in, hidden) | LSTM recurrent layer. |
| nn.GRU(in, hidden) | GRU recurrent layer. |
| nn.BatchNorm1d(features) | Batch normalization for 1D data. |
| nn.Dropout(p) | Dropout regularization. |
| nn.MaxPool2d(kernel) | Max pooling layer. |
| Activation | Usage |
| nn.ReLU() | Rectified Linear Unit. |
| nn.LeakyReLU() | Leaky ReLU. |
| nn.Sigmoid() | Sigmoid activation. |
| nn.Tanh() | Hyperbolic tangent. |
| nn.Softmax(dim) | Softmax over dimension dim, for multi-class probabilities. |
| torch.relu(x) | Functional ReLU. |
| Loss Function | Use Case |
| nn.CrossEntropyLoss() | Multi-class classification (expects raw logits). |
| nn.BCELoss() | Binary classification (expects probabilities). |
| nn.BCEWithLogitsLoss() | Binary classification with logits. |
| nn.MSELoss() | Mean squared error (regression). |
| nn.L1Loss() | Mean absolute error (regression). |
| nn.NLLLoss() | Negative log likelihood (expects log-probabilities). |
| Optimizer | Description |
| optim.Adam(params, lr) | Adaptive Moment Estimation. |
| optim.SGD(params, lr) | Stochastic Gradient Descent. |
| optim.AdamW(params, lr) | Adam with decoupled weight decay. |
| optim.RMSprop(params, lr) | Root Mean Square Propagation. |
| optim.Adagrad(params, lr) | Adaptive Gradient Algorithm. |
| Method | Description |
| optimizer.zero_grad() | Clear gradients from previous step. |
| loss.backward() | Compute gradients via backpropagation. |
| optimizer.step() | Update model parameters. |
| model.train() | Set model to training mode. |
| model.eval() | Set model to evaluation mode. |
| torch.no_grad() | Disable gradient computation. |
| Method | Description |
| torch.save(model.state_dict(), path) | Save model parameters. |
| model.load_state_dict(torch.load(path)) | Load model parameters. |
| torch.save(model, path) | Save entire model. |
| torch.load(path) | Load entire model. |
| Method | Description |
| tensor.to(device) | Move tensor to device (CPU/GPU). |
| tensor.view(shape) | Reshape tensor without copying (requires a compatible memory layout). |
| tensor.reshape(shape) | Reshape tensor (more flexible; copies only when a view is not possible). |
| tensor.flatten() | Flatten to 1D. |
| tensor.unsqueeze(dim) | Add a size-1 dimension at dim. |
| tensor.squeeze(dim) | Remove a size-1 dimension at dim. |
8. Advanced Features
Learning Rate Scheduling
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR
# Pick ONE of the schedulers below; each assignment replaces the previous one.

# Step decay: multiply the learning rate by gamma every step_size epochs
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)
# Reduce on plateau: multiply by factor after `patience` epochs without improvement
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=3, factor=0.5)
# Cosine annealing over T_max epochs
scheduler = CosineAnnealingLR(optimizer, T_max=10)

# In training loop (the `...` arguments are placeholders for your own)
for epoch in range(num_epochs):
    train(...)
    val_loss = validate(...)
    # ReduceLROnPlateau needs the monitored metric; the other schedulers
    # step on epoch count alone.
    if isinstance(scheduler, ReduceLROnPlateau):
        scheduler.step(val_loss)
    else:
        scheduler.step()
Data Augmentation
from torchvision import transforms
# Augmentation pipeline producing 32x32 crops. Normalize is given three
# means/stds, so it expects 3-channel (RGB) image tensors.
transform = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(degrees=10),
        transforms.ColorJitter(brightness=0.2, contrast=0.2),
        transforms.RandomCrop(size=32, padding=4),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)
Mixed Precision Training
from torch.cuda.amp import autocast, GradScaler
# GradScaler scales the loss so small half-precision gradients do not underflow
scaler = GradScaler()
for data, target in train_loader:
    # Inputs must be on the same device as the model
    data, target = data.to(device), target.to(device)
    optimizer.zero_grad()
    # Only the forward pass and loss computation run under autocast
    with autocast():
        output = model(data)
        loss = criterion(output, target)
    # Backward on the scaled loss; step() unscales the gradients before the
    # optimizer update, and update() adjusts the scale for the next iteration
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()
Model Checkpointing
# Save checkpoint: everything needed to resume training
checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    # Store a plain float so the file does not hold a tensor tied to a device
    # or to the autograd graph.
    'loss': float(loss),
}
torch.save(checkpoint, 'checkpoint.pth')

# Load checkpoint; map_location remaps saved tensors onto the current device
checkpoint = torch.load('checkpoint.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']
9. PyTorch vs TensorFlow
| Feature | PyTorch | TensorFlow |
| Graph Type | Dynamic (define-by-run) | Static (define-and-run) in TF 1.x, dynamic in TF 2.x |
| API Style | Pythonic, intuitive | More verbose, abstracted |
| Debugging | Easy with standard Python debugger | More challenging |
| Deployment | TorchScript, TorchServe | TensorFlow Serving, TensorFlow Lite |
| Community | Strong in research | Strong in production |
| Learning Curve | Gentler | Steeper |
10. Best Practices
- Use GPU when available and move models and data to the same device.
- Use model.train() for training and model.eval() for evaluation.
- Call optimizer.zero_grad() before backpropagation.
- Disable gradients for inference with torch.no_grad().
- Normalize input data for faster convergence.
- Use DataLoader for efficient batch processing.
- Monitor GPU memory and clear cache when needed.
- Prefer state_dict() for saving models.
11. Common Pitfalls
- Forgetting optimizer.zero_grad() causes gradients to accumulate across iterations.
- Not setting model mode makes Dropout and BatchNorm behave incorrectly.
- Shape mismatches in CNNs and RNNs cause runtime errors.
- Mixing CPU and GPU tensors in one operation raises an error; keep the model and all tensors on the same device.
- In-place tensor operations can break autograd.
- Storing tensors without detaching them (for example, accumulating loss instead of loss.item()) causes memory leaks.