First, this short video walkthrough of the ViT pipeline is worth watching:
15分钟认识ViT！【视觉Transformer】_哔哩哔哩_bilibili ("Understand ViT in 15 minutes", on Bilibili)
The input size is:
torch.Size([4, 3, 224, 224])
That is, batch_size = 4, three channels, and 224×224 images.
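As a quick sanity check, here is a minimal sketch of the patch arithmetic that the walkthrough below relies on, using einops.rearrange (which the demo code at the end also imports). The direct flattening here is shape-equivalent to the learned Conv2d projection used in the demo:

```python
import torch
from einops import rearrange

x = torch.randn(4, 3, 224, 224)           # (batch, channels, H, W)

patch_size = 16
n_patches = (224 // patch_size) ** 2      # 14 * 14 = 196 patches
patch_dim = 3 * patch_size * patch_size   # 3 * 16 * 16 = 768

# Split into non-overlapping 16x16 patches and flatten each one
patches = rearrange(x, 'b c (h p1) (w p2) -> b (h w) (p1 p2 c)',
                    p1=patch_size, p2=patch_size)
print(patches.shape)  # torch.Size([4, 196, 768])
```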
The forward process works as follows (the specific forward function can be seen in the complete demo code at the end of this post).

The patch_embed step splits the image into 16×16 patches. Before and after this step, x changes from (4, 3, 224, 224) to (4, 196, 768), where the leading 4 is the batch_size. A single patch has size 3×16×16. 196 = (224×224)/(16×16) = 14×14, i.e. a 224×224 image is split into a 14×14 grid of 196 patches, and each patch can be treated like a word in a sentence. 768 = 3×16×16, i.e. each three-channel patch is stretched into a one-dimensional vector.

Next, a CLS token is added, and x changes from (4, 196, 768) to (4, 197, 768). That is, one extra special token is prepended to the sequence.

Then the position embedding is added; the size of x does not change. This is similar to the positional encoding in the original Transformer, except that here the position embedding is a learnable matrix.

After that comes the standard Transformer encoder structure.

The complete model structure:

```
VisionTransformer(
  (patch_embed): PatchEmbedding(
    (projection): Sequential(
      (0): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      (1): Rearrange('b e h w -> b (h w) e')
    )
  )
  (pos_dropout): Dropout(p=0.1, inplace=False)
  (blocks): ModuleList(
    (0-11): 12 x TransformerBlock(
      (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): MultiHeadAttention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (head): Linear(in_features=768, out_features=1000, bias=True)
)
```

The complete demo code:

```python
"""
Complete Vision Transformer (ViT) implementation
for image classification.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from einops.layers.torch import Rearrange


class PatchEmbedding(nn.Module):
    """Split the image into patches and embed them."""

    def __init__(self, img_size=224, patch_size=16, in_channels=3, embed_dim=768):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.n_patches = (img_size // patch_size) ** 2

        # Use a conv layer to split the image into patches and project them to embed_dim
        self.projection = nn.Sequential(
            nn.Conv2d(in_channels, embed_dim, kernel_size=patch_size, stride=patch_size),
            Rearrange('b e h w -> b (h w) e'),  # rearrange dimensions
        )

    def forward(self, x):
        """
        x: (batch_size, channels, height, width)
        return: (batch_size, n_patches, embed_dim)
        """
        x = self.projection(x)
        return x


class MultiHeadAttention(nn.Module):
    """Multi-head self-attention."""

    def __init__(self, embed_dim=768, num_heads=12, dropout=0.0):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.scale = self.head_dim ** -0.5

        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"

        # Linear projections for Q, K, V
        self.qkv = nn.Linear(embed_dim, embed_dim * 3, bias=True)
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """x: (batch_size, seq_len, embed_dim)"""
        batch_size, seq_len, embed_dim = x.shape

        # Compute Q, K, V
        qkv = self.qkv(x)  # (batch_size, seq_len, embed_dim * 3)
        qkv = qkv.reshape(batch_size, seq_len, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, batch_size, num_heads, seq_len, head_dim)
        q, k, v = qkv[0], qkv[1], qkv[2]

        # Attention scores
        attn = (q @ k.transpose(-2, -1)) * self.scale  # (batch_size, num_heads, seq_len, seq_len)
        attn = attn.softmax(dim=-1)
        attn = self.dropout(attn)

        # Weighted sum of the values
        out = attn @ v  # (batch_size, num_heads, seq_len, head_dim)
        out = out.transpose(1, 2)  # (batch_size, seq_len, num_heads, head_dim)
        out = out.reshape(batch_size, seq_len, embed_dim)

        # Output projection
        out = self.proj(out)
        out = self.dropout(out)
        return out


class MLP(nn.Module):
    """Feed-forward network."""

    def __init__(self, embed_dim=768, mlp_ratio=4.0, dropout=0.0):
        super().__init__()
        hidden_dim = int(embed_dim * mlp_ratio)
        self.fc1 = nn.Linear(embed_dim, hidden_dim)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)
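        # Note: with the ViT-Base defaults, hidden_dim = int(768 * 4.0) = 3072,
        # which matches the fc1/fc2 sizes in the printed model structure above.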
    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.dropout(x)
        return x


class TransformerBlock(nn.Module):
    """Transformer encoder block."""

    def __init__(self, embed_dim=768, num_heads=12, mlp_ratio=4.0, dropout=0.0):
        super().__init__()
        self.norm1 = nn.LayerNorm(embed_dim)
        self.attn = MultiHeadAttention(embed_dim, num_heads, dropout)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.mlp = MLP(embed_dim, mlp_ratio, dropout)

    def forward(self, x):
        # Attention block + residual connection
        x = x + self.attn(self.norm1(x))
        # MLP block + residual connection
        x = x + self.mlp(self.norm2(x))
        return x


class VisionTransformer(nn.Module):
    """The complete Vision Transformer model."""

    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_channels=3,
        num_classes=1000,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        dropout=0.0,
        emb_dropout=0.0,
    ):
        super().__init__()

        # Patch embedding
        self.patch_embed = PatchEmbedding(img_size, patch_size, in_channels, embed_dim)
        num_patches = self.patch_embed.n_patches

        # CLS token (learnable parameter)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))

        # Position embedding (learnable parameter)
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
        self.pos_dropout = nn.Dropout(emb_dropout)

        # Transformer encoder
        self.blocks = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads, mlp_ratio, dropout)
            for _ in range(depth)
        ])

        # Final normalization layer
        self.norm = nn.LayerNorm(embed_dim)

        # Classification head
        self.head = nn.Linear(embed_dim, num_classes)

        # Initialize weights
        self._init_weights()

    def _init_weights(self):
        """Initialize the model weights."""
        nn.init.trunc_normal_(self.pos_embed, std=0.02)
        nn.init.trunc_normal_(self.cls_token, std=0.02)
        nn.init.trunc_normal_(self.head.weight, std=0.02)
        nn.init.constant_(self.head.bias, 0)

    def forward(self, x):
        """
        x: (batch_size, channels, height, width)
        return: (batch_size, num_classes)
        """
        batch_size = x.shape[0]

        # Patch embedding
        x = self.patch_embed(x)  # (batch_size, n_patches, embed_dim)

        # Prepend the CLS token
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # (batch_size, 1, embed_dim)
        x = torch.cat([cls_tokens, x], dim=1)  # (batch_size, n_patches + 1, embed_dim)

        # Add the position embedding
        x = x + self.pos_embed
        x = self.pos_dropout(x)

        # Pass through the Transformer encoder
        for block in self.blocks:
            x = block(x)

        # Normalize
        x = self.norm(x)

        # Classify from the CLS token
        cls_token_final = x[:, 0]  # (batch_size, embed_dim)
        logits = self.head(cls_token_final)  # (batch_size, num_classes)

        return logits


def create_vit_base():
    """Create a ViT-Base model."""
    return VisionTransformer(
        img_size=224,
        patch_size=16,
        in_channels=3,
        num_classes=1000,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        dropout=0.1,
        emb_dropout=0.1,
    )


def create_vit_small():
    """Create a ViT-Small model."""
    return VisionTransformer(
        img_size=224,
        patch_size=16,
        in_channels=3,
        num_classes=1000,
        embed_dim=384,
        depth=12,
        num_heads=6,
        mlp_ratio=4.0,
        dropout=0.1,
        emb_dropout=0.1,
    )


# Test code
if __name__ == "__main__":
    # Build the model
    model = create_vit_base()
    print(f"Number of parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.2f}M")

    # Random input
    batch_size = 4
    x = torch.randn(batch_size, 3, 224, 224)

    # Forward pass
    with torch.no_grad():
        output = model(x)

    print(f"Input shape: {x.shape}")
    print(f"Output shape: {output.shape}")
    print(f"Sample output: {output[0, :5]}")

    # Print the model structure
    print("\nModel structure:")
    print(model)
```
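To tie the demo back to the shape walkthrough at the top of the post, here is a minimal sketch (assuming the classes defined above) that traces the intermediate shapes by calling the submodules directly:

```python
import torch

# Trace the shapes from the walkthrough: patch embedding, CLS token, position embedding
model = create_vit_base().eval()
x = torch.randn(4, 3, 224, 224)

with torch.no_grad():
    p = model.patch_embed(x)
    print(p.shape)  # torch.Size([4, 196, 768]) -- after patch embedding

    cls = model.cls_token.expand(x.shape[0], -1, -1)
    p = torch.cat([cls, p], dim=1)
    print(p.shape)  # torch.Size([4, 197, 768]) -- after prepending the CLS token

    p = p + model.pos_embed  # learnable position embedding, broadcast over the batch
    print(p.shape)  # torch.Size([4, 197, 768]) -- shape unchanged
```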