在深度学习领域，特别是在自然语言处理（NLP）任务中，多头注意力机制（Multi-Head Attention）已经成为提升模型性能的关键技术之一。本文将深入剖析多头注意力机制的工作原理，并探讨其如何显著增强模型对复杂特征的学习能力。
import torch
import torch.nn as nn
import torch.nn.functional as F
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    The input is projected once into stacked query/key/value vectors, split
    into ``num_heads`` heads of width ``head_dim``, attended per head, then
    the heads are concatenated and passed through an output projection.
    No masking or dropout is applied.
    """

    def __init__(self, embed_dim, num_heads):
        """Create the projection layers.

        Args:
            embed_dim: Size of the last dimension of the input and output.
            num_heads: Number of attention heads; must divide ``embed_dim``.

        Raises:
            AssertionError: If ``embed_dim`` is not divisible by ``num_heads``.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        # Raised explicitly (rather than via `assert`) so the check is not
        # stripped under `python -O`; the exception type is kept as
        # AssertionError for callers that already rely on it.
        if self.head_dim * num_heads != embed_dim:
            raise AssertionError(
                "Embedding dimension must be divisible by number of heads"
            )
        # A single bias-free linear layer yields q, k and v concatenated
        # along the last dimension (3 * embed_dim).
        self.qkv_proj = nn.Linear(embed_dim, embed_dim * 3, bias=False)
        self.o_proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor whose size unpacks as (batch_size, seq_len, embed_dim).

        Returns:
            Tensor of shape (batch_size, seq_len, embed_dim).
        """
        batch_size, seq_len, embed_dim = x.size()

        # (B, S, 3*E) -> (B, S, 3, H, D) -> (3, B, H, S, D)
        qkv = self.qkv_proj(x).reshape(
            batch_size, seq_len, 3, self.num_heads, self.head_dim
        )
        q, k, v = qkv.permute(2, 0, 3, 1, 4).unbind(0)

        # Scaled dot-product scores per head: (B, H, S, S)
        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attn_weights = F.softmax(scores, dim=-1)

        # Weighted sum of values: (B, H, S, D)
        attn_output = torch.matmul(attn_weights, v)

        # Put heads back next to each other: (B, S, H, D) -> (B, S, E)
        attn_output = (
            attn_output.permute(0, 2, 1, 3)
            .contiguous()
            .view(batch_size, seq_len, embed_dim)
        )

        # Final linear projection
        return self.o_proj(attn_output)