# AI Algorithm Interview Questions

# Briefly describe the Transformer architecture

  • Role of Attention: capturing relationships with the context; role of the FFN: storing knowledge
  • The normalization method used is LayerNorm
  • Both the computational complexity and the memory complexity of self-attention are $O(N^2)$, where $N$ is the seq_len
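
Using the notation of the original paper, the scaled dot-product attention computed in each head is

$$
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V,
\qquad Q, K, V \in \mathbb{R}^{N \times d_k}
$$

where $d_k$ is the per-head dimension. The score matrix $QK^{\top}$ has shape $N \times N$, which is where both the quadratic compute and the quadratic memory come from.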

# BN vs. LN

Both BN and LN normalize their inputs toward a zero-mean, unit-variance distribution, which speeds up convergence and improves training stability. The difference is the axis: BN normalizes each feature across the batch dimension, while LN normalizes each sample across the feature dimension, so LN does not depend on the batch size and is the standard choice in Transformers.
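
A minimal sketch of the axis difference, assuming a `(batch, seq_len, hidden)` activation tensor:

    import torch
    from torch import nn

    batch, seq_len, hidden = 4, 16, 32
    x = torch.randn(batch, seq_len, hidden)

    # LayerNorm: statistics are computed over the last (feature) dimension,
    # independently for every (batch, position) pair.
    ln = nn.LayerNorm(hidden)
    y_ln = ln(x)                                   # shape (4, 16, 32)

    # BatchNorm1d expects (batch, channels, length): statistics are computed
    # per feature channel across the batch and sequence positions.
    bn = nn.BatchNorm1d(hidden)
    y_bn = bn(x.transpose(1, 2)).transpose(1, 2)   # shape (4, 16, 32)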

# Model inference process

# MHA, GQA, MQA

  • MQA (Multi-Query Attention): all heads share a single Key/Value matrix (equivalent to GQA with one KV head; a usage sketch follows the code below)

  • GQA (Grouped-Query Attention): a middle ground between MQA and MHA, for when you do not want to give up too much quality

    import math
    import torch
    from torch import nn

    class GroupedQueryAttention(nn.Module):
        def __init__(self, hidden_size, num_heads, num_key_value_heads):
            super().__init__()
            self.hidden_size = hidden_size
            self.num_heads = num_heads
            self.head_dim = hidden_size // num_heads
            self.num_key_value_heads = num_key_value_heads
            self.num_key_value_groups = num_heads // num_key_value_heads
            self.q_proj = nn.Linear(hidden_size, self.head_dim * self.num_heads)
            self.k_proj = nn.Linear(hidden_size, self.head_dim * self.num_key_value_heads)
            self.v_proj = nn.Linear(hidden_size, self.head_dim * self.num_key_value_heads)
            self.o_proj = nn.Linear(self.head_dim * self.num_heads, hidden_size)
            self.softmax = nn.Softmax(dim=-1)
        def expand(self, data):
            # Repeat each KV head num_key_value_groups times:
            # (batch, num_kv_heads, seq, head_dim) -> (batch, num_heads, seq, head_dim).
            batch_size, seq_len = data.shape[0], data.shape[2]
            data = data[:, :, None, :, :].expand(batch_size, self.num_key_value_heads,
                                                 self.num_key_value_groups, seq_len, self.head_dim).contiguous()
            data = data.view(batch_size, self.num_key_value_heads * self.num_key_value_groups,
                             seq_len, self.head_dim)
            return data
        def forward(self, q, k, v, mask=None):
            batch_size, seq_len = q.shape[0], q.shape[1]
            query_states, key_states, value_states = self.q_proj(q), self.k_proj(k), self.v_proj(v)
            # Split into heads: queries get num_heads, keys/values get num_key_value_heads.
            query_states = query_states.view(batch_size, seq_len, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
            key_states = key_states.view(batch_size, seq_len, self.num_key_value_heads, self.head_dim).permute(0, 2, 1, 3)
            value_states = value_states.view(batch_size, seq_len, self.num_key_value_heads, self.head_dim).permute(0, 2, 1, 3)
            key_states = self.expand(key_states)
            value_states = self.expand(value_states)
            # Scaled dot-product attention.
            attn_weights = query_states @ key_states.transpose(2, 3) / math.sqrt(self.head_dim)
            if mask is not None:
                attn_weights = attn_weights.masked_fill(mask == 0, -1e9)
            attn_output = self.softmax(attn_weights) @ value_states
            # Merge heads back and project to hidden_size.
            attn_output = attn_output.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_len, self.num_heads * self.head_dim)
            attn_output = self.o_proj(attn_output)
            return attn_output
  • MHA (Multi-Head Attention): Q, K, and V each have the same number of heads, in one-to-one correspondence

    import math
    import torch
    from torch import nn

    class MultiHeadAttention(nn.Module):
        def __init__(self, hidden_size, num_heads) -> None:
            super().__init__()
            self.hidden_size = hidden_size
            self.num_heads = num_heads
            self.head_dim = hidden_size // num_heads
            self.q_proj = nn.Linear(hidden_size, self.head_dim * self.num_heads)
            self.k_proj = nn.Linear(hidden_size, self.head_dim * self.num_heads)
            self.v_proj = nn.Linear(hidden_size, self.head_dim * self.num_heads)
            self.o_proj = nn.Linear(self.head_dim * self.num_heads, hidden_size)
            self.softmax = nn.Softmax(dim=-1)
        def forward(self, q, k, v, mask=None):
            batch_size, seq_len, hidden_size = q.shape
            query_states, key_states, value_states = self.q_proj(q), self.k_proj(k), self.v_proj(v)
            # Split into heads: (batch, seq, hidden) -> (batch, num_heads, seq, head_dim).
            query_states = query_states.view(batch_size, seq_len, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
            key_states = key_states.view(batch_size, seq_len, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
            value_states = value_states.view(batch_size, seq_len, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
            # Scaled dot-product attention.
            attn_weights = query_states @ key_states.transpose(2, 3) / math.sqrt(self.head_dim)
            if mask is not None:
                # Causal (lower-triangular) mask: each position may only attend
                # to itself and to earlier positions.
                causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=q.device))
                attn_weights = attn_weights.masked_fill(~causal_mask, float('-inf'))
            attn_output = self.softmax(attn_weights) @ value_states
            # Merge heads back and project to hidden_size.
            attn_output = attn_output.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_len, self.num_heads * self.head_dim)
            attn_output = self.o_proj(attn_output)
            return attn_output
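
A quick usage sketch (arbitrary sizes) tying the two classes above together: MHA corresponds to `num_key_value_heads == num_heads`, and MQA to `num_key_value_heads == 1`, so both extremes fall out of `GroupedQueryAttention`:

    batch, seq_len, hidden, heads = 2, 10, 512, 8
    x = torch.randn(batch, seq_len, hidden)

    mha = MultiHeadAttention(hidden, heads)
    gqa = GroupedQueryAttention(hidden, heads, num_key_value_heads=4)  # 2 query heads per KV head
    mqa = GroupedQueryAttention(hidden, heads, num_key_value_heads=1)  # all query heads share one KV head

    print(mha(x, x, x).shape)  # torch.Size([2, 10, 512])
    print(gqa(x, x, x).shape)  # torch.Size([2, 10, 512])
    print(mqa(x, x, x).shape)  # torch.Size([2, 10, 512])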

# Multi-Head Latent Attention

# Why divide by $\sqrt{d_k}$ after $QK^{\top}$ in attention?

When $d_k$ is large, the dot products of Q and K can become large in magnitude, pushing softmax into its saturated region where gradients become very small and training becomes difficult. Scaling by $\sqrt{d_k}$ keeps the dot-product values in a stable range and the gradients at a reasonable scale.
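
A short derivation of why $\sqrt{d_k}$ is the right scale, assuming the components of $q$ and $k$ are independent with zero mean and unit variance:

$$
q \cdot k = \sum_{i=1}^{d_k} q_i k_i,
\qquad
\mathbb{E}[q \cdot k] = 0,
\qquad
\mathrm{Var}(q \cdot k) = \sum_{i=1}^{d_k} \mathrm{Var}(q_i k_i) = d_k
$$

so the dot product has standard deviation $\sqrt{d_k}$, and dividing by $\sqrt{d_k}$ brings it back to unit variance before the softmax.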

# Positional encoding

  • Positional encoding in the Transformer (the sinusoidal form is given below)
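
For reference, the fixed sinusoidal encoding from the original Transformer paper, where $pos$ is the token position and $i$ indexes the embedding dimension:

$$
PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{\mathrm{model}}}}\right),
\qquad
PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{\mathrm{model}}}}\right)
$$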

# Gradient clipping in the Transformer

# Learning-rate warmup in the Transformer

# The difference between Pre-Norm and Post-Norm

  • Post-LN normalizes after the residual addition; it regularizes the parameters more strongly and therefore tends to be more robust
  • Pre-LN normalizes before the residual addition, so part of the signal passes straight through the skip path without being normalized, which helps prevent vanishing or exploding gradients

With few layers Post-LN performs somewhat better; as the depth grows, Pre-LN is preferable because it keeps training stable. A sketch of the two residual orderings follows.
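
A minimal sketch of the two orderings, with a hypothetical `sublayer` standing in for attention or the FFN, just to show where LayerNorm sits relative to the residual connection:

    from torch import nn

    class PostLNBlock(nn.Module):
        # Post-LN: x -> sublayer -> add residual -> LayerNorm
        def __init__(self, hidden_size, sublayer):
            super().__init__()
            self.sublayer = sublayer
            self.norm = nn.LayerNorm(hidden_size)
        def forward(self, x):
            return self.norm(x + self.sublayer(x))

    class PreLNBlock(nn.Module):
        # Pre-LN: x -> LayerNorm -> sublayer -> add residual (skip path is untouched)
        def __init__(self, hidden_size, sublayer):
            super().__init__()
            self.sublayer = sublayer
            self.norm = nn.LayerNorm(hidden_size)
        def forward(self, x):
            return x + self.sublayer(self.norm(x))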

# Why are most downloadable LLMs Decoder-Only?
