# AI Algorithm Interview Questions
# Briefly describe the Transformer architecture
- Role of Attention: capture relationships with the surrounding context; role of the FFN: store knowledge
- The normalization method used is LayerNorm
- Both the computational complexity and the space complexity are $O(n^2)$, where $n$ refers to the seq_len
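To make the structure concrete, here is a minimal sketch of a single encoder block (attention + FFN, each wrapped in a residual connection followed by LayerNorm); the layer sizes, GELU activation, and use of nn.MultiheadAttention are illustrative assumptions rather than anything fixed by the notes above.

```python
import torch
import torch.nn as nn


class TransformerBlock(nn.Module):
    """Minimal post-norm encoder block: attention gathers context, the FFN stores/transforms knowledge."""

    def __init__(self, hidden_size=512, num_heads=8, ffn_size=2048):
        super().__init__()
        self.attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(hidden_size, ffn_size),
            nn.GELU(),
            nn.Linear(ffn_size, hidden_size),
        )
        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)

    def forward(self, x):
        # Self-attention is O(n^2) in seq_len for both time and memory (the n x n score matrix)
        attn_out, _ = self.attn(x, x, x)
        x = self.norm1(x + attn_out)
        x = self.norm2(x + self.ffn(x))
        return x


x = torch.randn(2, 16, 512)          # (batch, seq_len, hidden)
print(TransformerBlock()(x).shape)   # torch.Size([2, 16, 512])
```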
# BN 与 LN
Both BN and LN normalize the data, pulling the inputs toward a standard normal distribution, which speeds up convergence and improves training stability. The difference is the normalization axis: BN normalizes each feature across the batch, while LN normalizes across the features of each individual sample, which is why LN is preferred for variable-length sequences in Transformers.
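A small comparison in PyTorch (tensor shapes are illustrative assumptions): LayerNorm computes statistics over the feature dimension of each token, while BatchNorm shares statistics across the batch and sequence positions.

```python
import torch
import torch.nn as nn

x = torch.randn(4, 16, 64)  # (batch, seq_len, hidden)

# LayerNorm: mean/var over the last (feature) dim, independently for every token
ln = nn.LayerNorm(64)
print(ln(x).shape)  # torch.Size([4, 16, 64])

# BatchNorm1d expects (batch, channels, length): statistics are shared across the batch and positions
bn = nn.BatchNorm1d(64)
print(bn(x.transpose(1, 2)).transpose(1, 2).shape)  # torch.Size([4, 16, 64])
```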
# Model inference process
# MHA, GQA, MQA
MQA (Multi-Query Attention): all heads share the same Key and Value matrices.

GQA (Grouped-Query Attention): a compromise between MQA and MHA that avoids sacrificing too much performance.

```python
import math
import torch
import torch.nn as nn


class GroupedQueryAttention(nn.Module):
    def __init__(self, hidden_size, num_heads, num_key_value_heads):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.num_key_value_heads = num_key_value_heads
        self.num_key_value_groups = num_heads // num_key_value_heads
        self.q_proj = nn.Linear(hidden_size, self.head_dim * self.num_heads)
        self.k_proj = nn.Linear(hidden_size, self.head_dim * self.num_key_value_heads)
        self.v_proj = nn.Linear(hidden_size, self.head_dim * self.num_key_value_heads)
        self.o_proj = nn.Linear(self.head_dim * self.num_heads, hidden_size)
        self.softmax = nn.Softmax(dim=-1)

    def expand(self, data):
        # Repeat each KV head num_key_value_groups times so K/V line up with the query heads
        batch_size, seq_len = data.shape[0], data.shape[2]
        data = data[:, :, None, :, :].expand(batch_size, self.num_key_value_heads,
                                             self.num_key_value_groups, seq_len, self.head_dim).contiguous()
        data = data.view(batch_size, self.num_key_value_heads * self.num_key_value_groups,
                         seq_len, self.head_dim)
        return data

    def forward(self, q, k, v, mask=None):
        batch_size, seq_len = q.shape[0], q.shape[1]
        query_states, key_states, value_states = self.q_proj(q), self.k_proj(k), self.v_proj(v)
        # (batch, seq_len, hidden) -> (batch, heads, seq_len, head_dim)
        query_states = query_states.view(batch_size, seq_len, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        key_states = key_states.view(batch_size, seq_len, self.num_key_value_heads, self.head_dim).permute(0, 2, 1, 3)
        value_states = value_states.view(batch_size, seq_len, self.num_key_value_heads, self.head_dim).permute(0, 2, 1, 3)
        # Broadcast the shared K/V heads to all query heads
        key_states = self.expand(key_states)
        value_states = self.expand(value_states)
        # Scaled dot-product attention
        attn_weights = query_states @ key_states.transpose(2, 3) / math.sqrt(self.head_dim)
        if mask is not None:
            attn_weights = attn_weights.masked_fill(mask == 0, -1e9)
        attn_output = self.softmax(attn_weights) @ value_states
        attn_output = attn_output.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_len, self.num_heads * self.head_dim)
        attn_output = self.o_proj(attn_output)
        return attn_output
```
MHA (Multi-Head Attention): Q, K, and V all have the same number of heads, with a one-to-one correspondence between them.

```python
class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size, num_heads) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads
        self.q_proj = nn.Linear(hidden_size, self.head_dim * self.num_heads)
        self.k_proj = nn.Linear(hidden_size, self.head_dim * self.num_heads)
        self.v_proj = nn.Linear(hidden_size, self.head_dim * self.num_heads)
        self.o_proj = nn.Linear(self.head_dim * self.num_heads, hidden_size)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None):
        batch_size, seq_len, hidden_size = q.shape
        query_states, key_states, value_states = self.q_proj(q), self.k_proj(k), self.v_proj(v)
        # (batch, seq_len, hidden) -> (batch, num_heads, seq_len, head_dim)
        query_states = query_states.view(batch_size, seq_len, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        key_states = key_states.view(batch_size, seq_len, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        value_states = value_states.view(batch_size, seq_len, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        attn_weights = query_states @ key_states.transpose(2, 3) / math.sqrt(self.head_dim)
        if mask is not None:
            # Causal (lower-triangular) mask: each position may only attend to itself and earlier positions
            causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=q.device))
            attn_weights = attn_weights.masked_fill(~causal_mask, float('-inf'))
        attn_output = self.softmax(attn_weights) @ value_states
        attn_output = attn_output.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_len, self.num_heads * self.head_dim)
        attn_output = self.o_proj(attn_output)
        return attn_output
```
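A quick usage sketch for the two modules above (batch size, sequence length, and hidden size are arbitrary): with num_key_value_heads equal to num_heads, GroupedQueryAttention reduces to MHA; with num_key_value_heads=1 it becomes MQA.

```python
x = torch.randn(2, 16, 512)  # (batch, seq_len, hidden)

mha = MultiHeadAttention(hidden_size=512, num_heads=8)
gqa = GroupedQueryAttention(hidden_size=512, num_heads=8, num_key_value_heads=2)
mqa = GroupedQueryAttention(hidden_size=512, num_heads=8, num_key_value_heads=1)  # one shared KV head

print(mha(x, x, x).shape)  # torch.Size([2, 16, 512])
print(gqa(x, x, x).shape)  # torch.Size([2, 16, 512])
print(mqa(x, x, x).shape)  # torch.Size([2, 16, 512])
```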
# Multi-Head Latent Attention
# Why is the QK product in attention divided by $\sqrt{d_k}$?
When the dimension $d_k$ is large, the dot products of Q and K can become very large, pushing the softmax into its saturation region, where gradients become extremely small and training becomes difficult. Scaling by $\sqrt{d_k}$ keeps the magnitude of the dot products stable and ensures that gradients stay in a reasonable range.
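A small numerical illustration (the dimension and seed are arbitrary assumptions): for unit-variance Q and K the raw dot product grows like $\sqrt{d_k}$ in magnitude, and logits of that scale push softmax into a near-one-hot regime with tiny gradients.

```python
import torch

torch.manual_seed(0)
d_k = 512
q, k = torch.randn(d_k), torch.randn(d_k)

print((q @ k).item())               # unscaled dot product: magnitude on the order of sqrt(d_k)
print((q @ k / d_k ** 0.5).item())  # scaled: back to roughly unit magnitude

logits = torch.randn(8) * d_k ** 0.5
print(torch.softmax(logits, dim=-1))               # nearly one-hot -> saturated, tiny gradients
print(torch.softmax(logits / d_k ** 0.5, dim=-1))  # smoother distribution -> useful gradients
```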
# Positional encoding
Positional encoding in the Transformer: self-attention is order-invariant, so position information must be injected into the token embeddings (the original Transformer uses fixed sinusoidal encodings).
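A minimal sketch of the fixed sinusoidal encoding from the original Transformer paper, $PE_{(pos,2i)}=\sin(pos/10000^{2i/d})$ and $PE_{(pos,2i+1)}=\cos(pos/10000^{2i/d})$; the shapes below are illustrative.

```python
import torch


def sinusoidal_positional_encoding(max_len: int, d_model: int) -> torch.Tensor:
    """Return a (max_len, d_model) table of fixed sinusoidal position encodings."""
    position = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)            # (max_len, 1)
    div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32)
                         * (-torch.log(torch.tensor(10000.0)) / d_model))         # (d_model/2,)
    pe = torch.zeros(max_len, d_model)
    pe[:, 0::2] = torch.sin(position * div_term)   # even dimensions
    pe[:, 1::2] = torch.cos(position * div_term)   # odd dimensions
    return pe


pe = sinusoidal_positional_encoding(max_len=128, d_model=512)
x = torch.randn(2, 128, 512)       # token embeddings: (batch, seq_len, d_model)
x = x + pe.unsqueeze(0)            # position information is simply added to the embeddings
print(x.shape)                     # torch.Size([2, 128, 512])
```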
# Gradient clipping in the Transformer
# Learning-rate warmup strategy in the Transformer
# Difference between preNorm and postNorm
Post-LN applies normalization after the residual addition; it regularizes the parameters more strongly and therefore gives the model better robustness. Pre-LN applies normalization before the residual addition, so part of the signal is added directly to the output without being normalized, which helps prevent vanishing or exploding gradients.

With a small number of layers, Post-LN tends to work better; as the network gets deeper, Pre-LN is preferable to keep training stable.
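A minimal sketch contrasting the two residual arrangements, using a plain FFN as a stand-in sublayer (sizes are illustrative assumptions):

```python
import torch
import torch.nn as nn


class PostLNBlock(nn.Module):
    """Post-LN: sublayer -> add residual -> LayerNorm (the whole residual stream is normalized)."""

    def __init__(self, hidden_size, sublayer):
        super().__init__()
        self.sublayer = sublayer
        self.norm = nn.LayerNorm(hidden_size)

    def forward(self, x):
        return self.norm(x + self.sublayer(x))


class PreLNBlock(nn.Module):
    """Pre-LN: LayerNorm -> sublayer -> add residual (the identity path stays un-normalized)."""

    def __init__(self, hidden_size, sublayer):
        super().__init__()
        self.sublayer = sublayer
        self.norm = nn.LayerNorm(hidden_size)

    def forward(self, x):
        return x + self.sublayer(self.norm(x))


ffn = nn.Sequential(nn.Linear(512, 2048), nn.GELU(), nn.Linear(2048, 512))
x = torch.randn(2, 16, 512)
print(PostLNBlock(512, ffn)(x).shape)  # torch.Size([2, 16, 512])
print(PreLNBlock(512, ffn)(x).shape)   # torch.Size([2, 16, 512])
```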