import paddle
import paddle.nn as nn

class PatchEmbedding(nn.Layer):
    def __init__(self, image_size, patch_size, in_channels, embed_dim, dropout=0.):
        super().__init__()
        # The patch embedding is essentially a convolution with
        # kernel size and stride both equal to the patch size
        self.patch_embedding = nn.Conv2D(in_channels,
                                         embed_dim,
                                         kernel_size=patch_size,
                                         stride=patch_size,
                                         bias_attr=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x comes in as [n, c, h, w]
        x = self.patch_embedding(x)  # after the conv: [n, c', h', w'], where c' is embed_dim
        x = x.flatten(2)             # merge dims 2 and 3: [n, c', h'*w']
        x = x.transpose([0, 2, 1])   # swap dims: [n, h'*w', c']
        x = self.dropout(x)
        return x
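As a quick sanity check, the sketch below pushes a batch through the layer; the concrete sizes (224x224 RGB input, patch_size=16, embed_dim=768) are illustrative assumptions, not values from the original text. With a 16x16 patch on a 224x224 image, the conv produces a 14x14 grid, i.e. 196 patch tokens.

# A minimal shape check (illustrative values, not from the original text)
patch_embed = PatchEmbedding(image_size=224, patch_size=16,
                             in_channels=3, embed_dim=768)
images = paddle.randn([1, 3, 224, 224])  # [n, c, h, w]
tokens = patch_embed(images)
print(tokens.shape)  # [1, 196, 768], where 196 = (224/16) * (224/16)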
2. Implement the MLP structure
The MLP is simply two fully connected layers: the first expands the feature dimension by mlp_ratio and the second projects it back, so the dimension is unchanged after passing through the MLP.
class Mlp(nn.Layer):
    def __init__(self, embed_dim, mlp_ratio=4.0, dropout=0.):
        super().__init__()
        # two fully connected layers: expand by mlp_ratio, then project back
        self.fc1 = nn.Linear(embed_dim, int(embed_dim * mlp_ratio))
        self.fc2 = nn.Linear(int(embed_dim * mlp_ratio), embed_dim)
        # GELU activation
        self.act = nn.GELU()
        # dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.dropout(x)
        return x
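To confirm the dimension really is preserved, here is a small check; the token shape is an assumed example carried over from the patch-embedding sketch above.

# A minimal shape check (illustrative values, not from the original text)
mlp = Mlp(embed_dim=768)
tokens = paddle.randn([1, 196, 768])
out = mlp(tokens)
print(out.shape)  # [1, 196, 768] -- same shape in and out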
3. Implement an Encoder layer
class EncoderLayer(nn.Layer):
    def __init__(self, embed_dim):
        super().__init__()
        # layer norm applied before attention (pre-norm)
        self.attn_norm = nn.LayerNorm(embed_dim)
        # the Attention layer will be implemented later
        self.attn = Attention()
        self.mlp_norm = nn.LayerNorm(embed_dim)
        # the Mlp structure implemented above
        self.mlp = Mlp(embed_dim)

    def forward(self, x):
        # residual connection around the attention block
        h = x
        x = self.attn_norm(x)
        x = self.attn(x)
        x = x + h  # dimensions are unchanged, so we can add directly
        # residual connection around the MLP block
        h = x
        x = self.mlp_norm(x)
        x = self.mlp(x)
        x = x + h
        return x
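Since Attention is only implemented later, one way to smoke-test the residual wiring in the meantime is the sketch below: it substitutes a hypothetical identity stub for Attention. The stub is a placeholder assumption, not the real attention layer, and can be deleted once the real one exists.

# Hypothetical stand-in so EncoderLayer can run before the real Attention exists;
# it simply returns its input unchanged (same shape as real attention output)
class Attention(nn.Layer):
    def forward(self, x):
        return x

encoder = EncoderLayer(embed_dim=768)
tokens = paddle.randn([1, 196, 768])
out = encoder(tokens)
print(out.shape)  # [1, 196, 768] -- the encoder layer preserves the token shape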