大家好，这里是双模态遥感目标检测精读系列第三篇。本期精读 Expert Systems With Applications（2026，IF≈8.5）顶刊论文 PE-Det，聚焦红外-可见光（IR-VIS）双模态目标检测。论文从论文创新、模块拆解、可运行代码到顶刊二次创新思路全覆盖，适合科研党、算法工程师直接复用。论文标题如下：PE-Det: Prior-Guided visible preconditioning and routed expert fusion for robust infrared-visible object detection。论文代码如下：https://github.com/601140736/PE-Det。PE-Det 是一篇发表于 Expert Systems With Applications（2026，IF≈8.5）的红外-可见光（IR-VIS）双模态目标检测论文，聚焦恶劣环境下可见光严重退化（低光照、雾霾散射、对比度崩塌）导致的跨模态不一致、固定融合策略失效问题，提出退化感知自适应融合框架，通过先验引导可见光预处理（PVP）、多尺度动态专家融合（MDE）、跨尺度特征聚合颈部（GS-SSFF）、核心聚焦边界框回归损失（CFI-MPD-IoU）四大协同模块，系统性解决退化引发的误差传播问题。论文在 FLIR、M3FD 两大权威双模态数据集上全面超越 YOLOv8 (Dual)、SLF-YOLO (Dual) 及十余种主流融合检测器，在 mAP0.5:0.95 严格定位指标上提升显著，跨数据集泛化验证了对未知退化与域偏移的鲁棒性。

创新点一：先验引导可见光预处理模块（PVP）。该创新点首次提出面向红外-可见光检测的不对称可微分预处理机制：仅针对易退化的可见光模态进行增强处理，保持红外模态原始稳定的热特征不变；通过集成物理驱动的逆散射去雾、递归式光照曲线校正与可学习拉普拉斯边缘增强三类先验算子，以残差耦合方式逐级稳定可见光图像的光度统计特性并强化结构相关特征，从输入源头降低退化引发的跨模态分布差异，为后续模态交互提供更可靠的特征基础。区别于传统视觉增强与检测任务脱节、双模态同步处理的低效方式，实现检测导向的精准预处理。

import torch
import torch.nn as nn


# ---- 模型部分：low_light_enhance ----
class Low_enhance_net(nn.Module):
    def __init__(self, in_channels):
        super(Low_enhance_net, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 4, kernel_size=3, padding=1, stride=1)
        self.conv2 = nn.Conv2d(4, 8, kernel_size=3, padding=1, stride=1)
        self.conv5 = nn.Conv2d(8, 8, kernel_size=3, padding=1, stride=1)
        self.leaky_relu = nn.LeakyReLU()

    def forward(self, x):
        x = self.leaky_relu(self.conv1(x))
        x = self.leaky_relu(self.conv2(x))
        x = self.leaky_relu(self.conv5(x))
        r1, r2, r3, r4, r5, r6, r7, r8 = torch.split(x, 1, dim=1)
        return [r1, r2, r3, r4, r5, r6, r7, r8]


def low_enhance_feature(low_light_image, r):
    # 遍历 r 中的每个元素并逐步增强
    for r_it in r:
        # 将 r_it 通过 sigmoid 压缩到 0 到 1 的范围内，防止其值过大或过小
        r_it = torch.sigmoid(r_it)
        # 增强操作：添加一个很小的常数 1e-6 来提高数值稳定性，避免零除或下溢
        low_light_image = low_light_image + r_it * (torch.pow(low_light_image, 2) - low_light_image + 1e-6)
        # 对每次迭代的结果进行裁剪，避免值过大或过小
        low_light_image = torch.clamp(low_light_image, min=0.0, max=1.0)
    # 将原始图像加回结果，作为增强后的最终输出
    return low_light_image


import torch
import torch.nn as nn
import torch.nn.functional as F

# ----
# ---- 模型部分：预测透射率图和大气光 ----
class DehazeNet(nn.Module):
    def __init__(self, in_channel):
        super(DehazeNet, self).__init__()
        # 卷积层，用于提取特征
        self.conv1 = nn.Conv2d(in_channel, 4, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(4, 8, kernel_size=3, padding=1)
        self.conv5 = nn.Conv2d(8, 1, kernel_size=3, padding=1)  # 输出1个通道，用于透射率图
        # 全连接层，用于估计大气光值
        self.fc1 = nn.Linear(8, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        # 提取透射率图的特征
        x = nn.LeakyReLU()(self.conv1(x))
        x = nn.LeakyReLU()(self.conv2(x))
        transmission = torch.sigmoid(self.conv5(x))  # 透射率图，输出范围为[0,1]
        # 用于估计大气光值的特征池化
        pooled = F.adaptive_avg_pool2d(x, (1, 1))
        pooled = pooled.view(pooled.size(0), -1)  # flatten
        A = torch.sigmoid(self.fc1(pooled))
        A = torch.sigmoid(self.fc2(A))  # 大气光值，输出范围为[0,1]（RGB）
        return transmission, A


# ---- 去雾过程：通过大气散射模型公式去雾 ----
def dehaze_feature(hazy_image, transmission, A, t0=0.01):
    # 假设 hazy_image 形状为 (B, C, H, W)
    transmission = torch.clamp(transmission, min=t0)  # 避免除0情况：透射率下限 t0
    # 扩展 A 到 (B, C, H, W) 维度
    A = A.view(A.size(0), A.size(1), 1, 1)  # A shape: (B, 1, 1, 1)
    A = A.expand_as(hazy_image)
    # 去雾公式：J(x) = (I(x) - A) / t(x) + A
    dehazed_image = (hazy_image - A) / transmission + A
    # dehazed_image = torch.clamp(dehazed_image, 0, 1)  # 确保像素值在[0,1]范围内
    return dehazed_image


class IN_poir(nn.Module):
    def __init__(self):
        super(IN_poir, self).__init__()
        self.L_N = Low_enhance_net(in_channels=3)
        self.F_N = DehazeNet(in_channel=3)
        self.w1 = nn.Parameter(torch.randn(1))
        self.w2 = nn.Parameter(torch.randn(1))

    def forward(self, x):
        vi, ir = x[:, :3, :, :], x[:, 3:, :, :]
        r = self.L_N(vi)
        t, a = self.F_N(vi)
        vi = vi * (1 - self.w1) + self.w1 * dehaze_feature(vi, t, a)
        vi = vi * (1 - self.w2) + self.w2 * low_enhance_feature(vi, r)
        return torch.cat([vi, ir], dim=1)

创新点二：多尺度动态专家融合模块（MDE）。这个创新点打开了新的思路：对于跨模态特征融合，将几个经典的模块融合起来，这几个"专家融合模块"由网络自适应融合。该创新点首次构建基于金字塔层级的路由式专家融合机制：针对不同场景与目标尺度下模态可靠性差异大、固定融合策略失效的问题，设计包含全局令牌融合、跨模态交互、掩码引导局部互补与直接拼接四类互补融合专家的共享池；在 P2 至 P5 每个特征金字塔层级独立计算跨模态差异并生成路由决策，采用硬 Top-1 选择机制动态激活当前尺度最优的融合专家，使模型能够根据场景复杂度与尺度依赖的模态差异自适应调整融合行为，突破传统单一融合方式无法适配全场景与全尺度目标的局限，实现精细化的退化感知自适应融合。

#
# YOLOv5 common modules (plus the PE-Det fusion experts: GPT, fusion, CVCI, MFusion).
# Reconstructed from an extraction-garbled paste: all '=' / '+' operators, string
# quotes, docstring quotes and the @staticmethod decorator had been stripped.
import math
from copy import copy
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init, Sequential

# NOTE(review): requests / PIL / torch.cuda.amp are imported by the original file
# but unused in this chunk; guard them so the module stays importable in minimal
# environments while keeping the names available for the rest of the file.
try:
    import requests
    from PIL import Image
    from torch.cuda import amp
except ImportError:  # pragma: no cover - optional heavy dependencies
    requests = Image = amp = None


def autopad(k, p=None):  # kernel, padding
    """Return 'same' padding for kernel size k (int or per-dim list) when p is None."""
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]  # auto-pad
    return p


def DWConv(c1, c2, k=1, s=1, act=True):
    """Depthwise convolution: groups = gcd(c1, c2)."""
    return Conv(c1, c2, k, s, g=math.gcd(c1, c2), act=act)


class Conv(nn.Module):
    """Standard convolution: Conv2d -> BatchNorm2d -> activation (SiLU by default)."""

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super(Conv, self).__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        # act=True -> SiLU; an nn.Module -> use it as-is; anything else -> identity
        self.act = nn.SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

    def fuseforward(self, x):
        # Used after conv+bn fusion: skip the (already folded) BatchNorm.
        return self.act(self.conv(x))


class TransformerLayer(nn.Module):
    """Transformer layer, https://arxiv.org/abs/2010.11929 (LayerNorm removed for better performance)."""

    def __init__(self, c, num_heads):
        super().__init__()
        self.q = nn.Linear(c, c, bias=False)
        self.k = nn.Linear(c, c, bias=False)
        self.v = nn.Linear(c, c, bias=False)
        self.ma = nn.MultiheadAttention(embed_dim=c, num_heads=num_heads)
        self.fc1 = nn.Linear(c, c, bias=False)
        self.fc2 = nn.Linear(c, c, bias=False)

    def forward(self, x):
        x = self.ma(self.q(x), self.k(x), self.v(x))[0] + x  # residual attention
        x = self.fc2(self.fc1(x)) + x                        # residual MLP
        return x


class TransformerBlock(nn.Module):
    """Vision Transformer block, https://arxiv.org/abs/2010.11929."""

    def __init__(self, c1, c2, num_heads, num_layers):
        super().__init__()
        self.conv = None
        if c1 != c2:
            self.conv = Conv(c1, c2)
        self.linear = nn.Linear(c2, c2)  # learnable position embedding
        self.tr = nn.Sequential(*[TransformerLayer(c2, num_heads) for _ in range(num_layers)])
        self.c2 = c2

    def forward(self, x):
        if self.conv is not None:
            x = self.conv(x)
        b, _, w, h = x.shape
        p = x.flatten(2)
        p = p.unsqueeze(0)
        p = p.transpose(0, 3)
        p = p.squeeze(3)
        e = self.linear(p)
        x = p + e
        x = self.tr(x)
        x = x.unsqueeze(3)
        x = x.transpose(0, 3)
        x = x.reshape(b, self.c2, w, h)
        return x


class Bottleneck(nn.Module):
    """Standard bottleneck with optional shortcut (only when c1 == c2)."""

    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, shortcut, groups, expansion
        super(Bottleneck, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c_, c2, 3, 1, g=g)
        self.add = shortcut and c1 == c2

    def forward(self, x):
        return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))


class BottleneckCSP(nn.Module):
    """CSP Bottleneck, https://github.com/WongKinYiu/CrossStagePartialNetworks."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
        super(BottleneckCSP, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = nn.Conv2d(c1, c_, 1, 1, bias=False)
        self.cv3 = nn.Conv2d(c_, c_, 1, 1, bias=False)
        self.cv4 = Conv(2 * c_, c2, 1, 1)
        self.bn = nn.BatchNorm2d(2 * c_)  # applied to cat(cv2, cv3)
        self.act = nn.LeakyReLU(0.1, inplace=True)
        self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])

    def forward(self, x):
        y1 = self.cv3(self.m(self.cv1(x)))
        y2 = self.cv2(x)
        return self.cv4(self.act(self.bn(torch.cat((y1, y2), dim=1))))


class C3(nn.Module):
    """CSP Bottleneck with 3 convolutions."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
        super(C3, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c1, c_, 1, 1)
        self.cv3 = Conv(2 * c_, c2, 1)  # act=FReLU(c2)
        self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])

    def forward(self, x):
        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))


class C3TR(C3):
    """C3 module with a TransformerBlock() replacing the bottleneck stack."""

    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
        super().__init__(c1, c2, n, shortcut, g, e)
        c_ = int(c2 * e)
        self.m = TransformerBlock(c_, c_, 4, n)


class SPP(nn.Module):
    """Spatial pyramid pooling layer used in YOLOv3-SPP."""

    def __init__(self, c1, c2, k=(5, 9, 13)):
        super(SPP, self).__init__()
        c_ = c1 // 2  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)
        self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
        self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])

    def forward(self, x):
        x = self.cv1(x)
        return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))


class Focus(nn.Module):
    """Focus wh information into c-space: x(b,c,w,h) -> y(b,4c,w/2,h/2) -> conv."""

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super(Focus, self).__init__()
        self.conv = Conv(c1 * 4, c2, k, s, p, g, act)
        # self.contract = Contract(gain=2)

    def forward(self, x):  # x(b,c,w,h) -> y(b,4c,w/2,h/2)
        return self.conv(torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2],
                                    x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1))
        # return self.conv(self.contract(x))


class Contract(nn.Module):
    """Contract width-height into channels, i.e. x(1,64,80,80) to x(1,256,40,40)."""

    def __init__(self, gain=2):
        super().__init__()
        self.gain = gain

    def forward(self, x):
        N, C, H, W = x.size()  # assumes H and W divisible by gain
        s = self.gain
        x = x.view(N, C, H // s, s, W // s, s)           # x(1,64,40,2,40,2)
        x = x.permute(0, 3, 5, 1, 2, 4).contiguous()     # x(1,2,2,64,40,40)
        return x.view(N, C * s * s, H // s, W // s)      # x(1,256,40,40)


class Expand(nn.Module):
    """Expand channels into width-height, i.e. x(1,64,80,80) to x(1,16,160,160)."""

    def __init__(self, gain=2):
        super().__init__()
        self.gain = gain

    def forward(self, x):
        N, C, H, W = x.size()  # assumes C divisible by gain**2
        s = self.gain
        x = x.view(N, s, s, C // s ** 2, H, W)           # x(1,2,2,16,80,80)
        x = x.permute(0, 3, 4, 1, 5, 2).contiguous()     # x(1,16,80,2,80,2)
        return x.view(N, C // s ** 2, H * s, W * s)      # x(1,16,160,160)


class Concat(nn.Module):
    """Concatenate a list of tensors along a given dimension."""

    def __init__(self, dimension=1):
        super(Concat, self).__init__()
        self.d = dimension

    def forward(self, x):
        return torch.cat(x, self.d)


class Add(nn.Module):
    """Add two tensors: x[0] + x[1]."""

    def __init__(self, arg):
        super(Add, self).__init__()
        self.arg = arg

    def forward(self, x):
        return torch.add(x[0], x[1])


class Add2(nn.Module):
    """Add x[0] to one branch of a two-output module: x[1][0] or x[1][1] selected by index."""

    def __init__(self, c1, index):
        super().__init__()
        self.index = index

    def forward(self, x):
        if self.index == 0:
            return torch.add(x[0], x[1][0])
        elif self.index == 1:
            return torch.add(x[0], x[1][1])


class Classify(nn.Module):
    """Classification head, i.e. x(b,c1,20,20) to x(b,c2)."""

    def __init__(self, c1, c2, k=1, s=1, p=None, g=1):  # ch_in, ch_out, kernel, stride, padding, groups
        super(Classify, self).__init__()
        self.aap = nn.AdaptiveAvgPool2d(1)  # to x(b,c1,1,1)
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g)  # to x(b,c2,1,1)
        self.flat = nn.Flatten()

    def forward(self, x):
        z = torch.cat([self.aap(y) for y in (x if isinstance(x, list) else [x])], 1)  # cat if list
        return self.flat(self.conv(z))  # flatten to x(b,c2)


class SelfAttention(nn.Module):
    """Multi-head masked self-attention layer."""

    def __init__(self, d_model, d_k, d_v, h, attn_pdrop=.1, resid_pdrop=.1):
        """
        :param d_model: Output dimensionality of the model
        :param d_k: Dimensionality of queries and keys
        :param d_v: Dimensionality of values
        :param h: Number of heads
        """
        super(SelfAttention, self).__init__()
        assert d_k % h == 0
        self.d_model = d_model
        self.d_k = d_model // h
        self.d_v = d_model // h
        self.h = h
        # key, query, value projections for all heads
        self.que_proj = nn.Linear(d_model, h * self.d_k)  # query projection
        self.key_proj = nn.Linear(d_model, h * self.d_k)  # key projection
        self.val_proj = nn.Linear(d_model, h * self.d_v)  # value projection
        self.out_proj = nn.Linear(h * self.d_v, d_model)  # output projection
        # regularization
        self.attn_drop = nn.Dropout(attn_pdrop)
        self.resid_drop = nn.Dropout(resid_pdrop)
        self.init_weights()

    def init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                init.kaiming_normal_(m.weight, mode='fan_out')
                if m.bias is not None:
                    init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                init.constant_(m.weight, 1)
                init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                init.normal_(m.weight, std=0.001)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def forward(self, x, attention_mask=None, attention_weights=None):
        """
        Computes Self-Attention.

        Args:
            x (tensor): input (token), dim (b_s, nx, c); nx = H*W for CNN features
            attention_mask: mask over attention values (b_s, h, nq, nk); True masks out
            attention_weights: multiplicative weights for attention values (b_s, h, nq, nk)
        Returns:
            tensor of dim (b_s, nx, c)
        """
        b_s, nq = x.shape[:2]
        nk = x.shape[1]
        q = self.que_proj(x).view(b_s, nq, self.h, self.d_k).permute(0, 2, 1, 3)  # (b_s, h, nq, d_k)
        k = self.key_proj(x).view(b_s, nk, self.h, self.d_k).permute(0, 2, 3, 1)  # (b_s, h, d_k, nk) = K^T
        v = self.val_proj(x).view(b_s, nk, self.h, self.d_v).permute(0, 2, 1, 3)  # (b_s, h, nk, d_v)

        # Attention(Q,K,V) = Softmax(Q*K^T / sqrt(d_k)) * V
        att = torch.matmul(q, k) / np.sqrt(self.d_k)  # (b_s, h, nq, nk)
        if attention_weights is not None:
            att = att * attention_weights
        if attention_mask is not None:
            att = att.masked_fill(attention_mask, -np.inf)
        att = torch.softmax(att, -1)
        att = self.attn_drop(att)

        out = torch.matmul(att, v).permute(0, 2, 1, 3).contiguous().view(b_s, nq, self.h * self.d_v)  # (b_s, nq, h*d_v)
        out = self.resid_drop(self.out_proj(out))  # (b_s, nq, d_model)
        return out


class myTransformerBlock(nn.Module):
    """Transformer block: pre-LN self-attention + MLP, both residual."""

    def __init__(self, d_model, d_k, d_v, h, block_exp, attn_pdrop, resid_pdrop):
        """
        :param d_model: Output dimensionality of the model
        :param d_k: Dimensionality of queries and keys
        :param d_v: Dimensionality of values
        :param h: Number of heads
        :param block_exp: Expansion factor for MLP (feed forward network)
        """
        super().__init__()
        self.ln_input = nn.LayerNorm(d_model)
        self.ln_output = nn.LayerNorm(d_model)
        self.sa = SelfAttention(d_model, d_k, d_v, h, attn_pdrop, resid_pdrop)
        self.mlp = nn.Sequential(
            nn.Linear(d_model, block_exp * d_model),
            # nn.SiLU(),  # changed from GELU
            nn.GELU(),
            nn.Linear(block_exp * d_model, d_model),
            nn.Dropout(resid_pdrop),
        )

    def forward(self, x):
        bs, nx, c = x.size()
        x = x + self.sa(self.ln_input(x))
        x = x + self.mlp(self.ln_output(x))
        return x


class GPT(nn.Module):
    """GPT-style cross-modality token fusion over pooled RGB + IR feature maps."""

    def __init__(self, d_model, h=8, block_exp=4, n_layer=8,
                 vert_anchors=8, horz_anchors=8,
                 embd_pdrop=0.1, attn_pdrop=0.1, resid_pdrop=0.1):
        super().__init__()
        self.n_embd = d_model
        self.vert_anchors = vert_anchors
        self.horz_anchors = horz_anchors
        d_k = d_model
        d_v = d_model

        # positional embedding parameter (learnable), for rgb_fea + ir_fea tokens
        self.pos_emb = nn.Parameter(torch.zeros(1, 2 * vert_anchors * horz_anchors, self.n_embd))
        # transformer
        self.trans_blocks = nn.Sequential(
            *[myTransformerBlock(d_model, d_k, d_v, h, block_exp, attn_pdrop, resid_pdrop)
              for layer in range(n_layer)])
        # decoder head
        self.ln_f = nn.LayerNorm(self.n_embd)
        # regularization
        self.drop = nn.Dropout(embd_pdrop)
        # avgpool to a fixed token grid (reduces cost of attention)
        self.avgpool = nn.AdaptiveAvgPool2d((self.vert_anchors, self.horz_anchors))
        # init weights
        self.apply(self._init_weights)

    @staticmethod
    def _init_weights(module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, x):
        """x is a pair [rgb_fea, ir_fea], each (B, C, H, W); returns (B, 2C, H, W)."""
        rgb_fea = x[0]  # (B, C, H, W)
        ir_fea = x[1]   # (B, C, H, W)
        assert rgb_fea.shape[0] == ir_fea.shape[0]
        bs, c, h, w = rgb_fea.shape

        # AvgPooling to reduce the token count (attention is expensive)
        rgb_fea = self.avgpool(rgb_fea)
        ir_fea = self.avgpool(ir_fea)

        # concat both modalities along the token dimension
        rgb_fea_flat = rgb_fea.view(bs, c, -1)
        ir_fea_flat = ir_fea.view(bs, c, -1)
        token_embeddings = torch.cat([rgb_fea_flat, ir_fea_flat], dim=2)
        token_embeddings = token_embeddings.permute(0, 2, 1).contiguous()  # (B, 2*H*W, C)

        x = self.drop(self.pos_emb + token_embeddings)  # add positional embedding
        x = self.trans_blocks(x)                        # (B, 2*H*W, C)
        x = self.ln_f(x)
        x = x.view(bs, 2, self.vert_anchors, self.horz_anchors, self.n_embd)
        x = x.permute(0, 1, 4, 2, 3)                    # (B, 2, C, H, W)

        # NOTE(review, from original): slicing here — a learned mapping might be better
        rgb_fea_out = x[:, 0, :, :, :].contiguous().view(bs, self.n_embd, self.vert_anchors, self.horz_anchors)
        ir_fea_out = x[:, 1, :, :, :].contiguous().view(bs, self.n_embd, self.vert_anchors, self.horz_anchors)

        # upsample back to the input spatial size
        rgb_fea_out = F.interpolate(rgb_fea_out, size=([h, w]), mode='bilinear')
        ir_fea_out = F.interpolate(ir_fea_out, size=([h, w]), mode='bilinear')
        return torch.cat([rgb_fea_out, ir_fea_out], dim=1)


class fusion(nn.Module):
    """Mask-guided local complementary fusion expert; SE_Block is defined elsewhere in the file."""

    def __init__(self, channel, reduction=16):
        super().__init__()
        self.channel = channel
        self.mask_map_r = nn.Conv2d(channel, 1, 1, 1, 0, bias=True)
        self.mask_map_i = nn.Conv2d(channel, 1, 1, 1, 0, bias=True)
        self.softmax = nn.Softmax(-1)
        self.bottleneck1 = nn.Conv2d(channel, channel, 3, 1, 1, bias=False)
        self.bottleneck2 = nn.Conv2d(channel, channel, 3, 1, 1, bias=False)
        self.se = SE_Block(channel * 2, reduction)  # NOTE(review): SE_Block not in this chunk — confirm definition

    def forward(self, x):
        x_left_ori, x_right_ori = x[0], x[1]
        x_left, x_right = x_left_ori * 0.5, x_right_ori * 0.5
        x_mask_left = torch.mul(self.mask_map_r(x_left), x_left)
        x_mask_right = torch.mul(self.mask_map_i(x_right), x_right)
        out_IR = self.bottleneck1(x_mask_right + x_right_ori)
        out_RGB = self.bottleneck2(x_mask_left + x_left_ori)
        out = self.se(torch.cat([out_RGB, out_IR], 1))
        return out


# ---------- 融合模块1: cvci ----------
class CVCI(nn.Module):
    """Cross-modal interaction expert; PatchEmbed / CDAM_Block are defined elsewhere in the file."""

    def __init__(self, in_chans=3, embed_dims=32, img_size=16, num_classes=1000, stem_channel=16,
                 fc_dim=1280, num_heads=[1, 2], mlp_ratios=[3.6, 3.6], qkv_bias=True, qk_scale=None,
                 representation_size=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.,
                 hybrid_backbone=None, norm_layer=None, depths=[0, 1], qk_ratio=1,
                 sr_ratios=[8, 4], dp=0.1):
        super().__init__()
        self.out_dict = {}

        #################### ir transformer ####################
        self.ir_patch_embed_b = PatchEmbed(
            img_size=img_size, patch_size=1, in_chans=in_chans, embed_dim=embed_dims)
        ir_dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
        ir_cur = 0
        ir_cur += depths[0]
        self.ir_blocks_b = nn.ModuleList([
            CDAM_Block(
                dim=embed_dims, num_heads=num_heads[1], mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias,
                qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate,
                drop_path=ir_dpr[ir_cur + i], qk_ratio=qk_ratio, sr_ratio=sr_ratios[1])
            for i in range(depths[1])])

        #################### vis transformer ####################
        self.vis_patch_embed_b = PatchEmbed(
            img_size=img_size, patch_size=1, in_chans=in_chans, embed_dim=embed_dims)
        vis_dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
        vis_cur = 0
        vis_cur += depths[0]
        self.vis_blocks_b = nn.ModuleList([
            CDAM_Block(
                dim=embed_dims, num_heads=num_heads[1], mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias,
                qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate,
                drop_path=vis_dpr[vis_cur + i], qk_ratio=qk_ratio, sr_ratio=sr_ratios[1])
            for i in range(depths[1])])

    def forward(self, x):
        x, y = x[0], x[1]
        x, (H, W) = self.ir_patch_embed_b(x)
        y, (H, W) = self.vis_patch_embed_b(y)
        A = x
        B = y
        if self.out_dict != {}:
            for i, blk in enumerate(self.ir_blocks_b):
                x = blk(x, B, H, W, self.out_dict['ir_relative_pos_b'])
        else:
            for i, blk in enumerate(self.ir_blocks_b):
                x = blk(x, B, H, W)
        if self.out_dict != {}:
            for i, blk in enumerate(self.vis_blocks_b):
                y = blk(y, A, H, W, self.out_dict['vis_relative_pos_b'])
        else:
            for i, blk in enumerate(self.vis_blocks_b):
                y = blk(y, A, H, W)
        B, N, C = x.shape
        x = x.permute(0, 2, 1).reshape(B, C, H, W)  # inter-domain fusion; a plain conv could follow (original note)
        y = y.permute(0, 2, 1).reshape(B, C, H, W)
        out_feature = torch.cat((x, y), dim=1)
        return out_feature


class MFusion(nn.Module):
    """Routed mixture-of-experts fusion: hard Top-1 routing over four fusion experts per level."""

    def __init__(self, dim_in):
        super(MFusion, self).__init__()
        dim = dim_in // 4  # NOTE(review): assumes each modality carries dim_in//4 channels — confirm against the YAML
        self.dim = dim
        self.num_experts = 4
        self.top_k = 1
        # expert pool: each expert maps [x_vis, x_ir] (each (B, C, H, W)) -> (B, 2C, H, W)
        self.experts = nn.ModuleList([
            CVCI(in_chans=dim, embed_dims=dim),
            fusion(channel=dim),
            GPT(d_model=dim),
            concat(),  # NOTE(review): lowercase `concat` expected elsewhere in the file — confirm
        ])
        # gating network: routes on the pooled absolute cross-modal difference
        self.gating_network = nn.Linear(dim, self.num_experts, bias=False)

    def forward(self, x):
        xf = torch.abs(x[0] - x[1])  # cross-modal difference, (B, C, H, W)
        B, C, H, W = xf.shape
        pool_x = F.adaptive_avg_pool2d(xf, (1, 1)).squeeze(2).squeeze(2)  # (B, C)

        # gating weights: per-sample expert selection probabilities
        gate_logits = self.gating_network(pool_x)        # (B, num_experts)
        gate_weights = F.softmax(gate_logits, dim=-1)    # (B, num_experts)
        # removed stray debug `print(gate_weights)` from the original

        # hard top-k expert selection
        topk_values, topk_indices = torch.topk(gate_weights, self.top_k, dim=-1)  # (B, top_k)

        # accumulate MoE output; experts produce concatenated (B, 2C, H, W)
        moe_output = torch.zeros_like(torch.cat([x[0], x[1]], dim=1))
        for i in range(self.top_k):
            expert_idx = topk_indices[:, i]                     # (B,)
            weight = topk_values[:, i].view(B, 1, 1, 1)         # broadcast gate weight
            # evaluate each expert only for the batch elements that selected it
            for j in range(self.num_experts):
                mask = (expert_idx == j).view(B, 1, 1, 1)
                if mask.any():
                    moe_output += weight * mask * self.experts[j](x)
        return moe_output
        # print(self.experts[0](x).shape, self.experts[1](x).shape)
        # return self.experts[2](x)

大家可以将自己的融合方式放进去，包括之前的融合方式。笔者为大家提供的融合方式之外，同时也为大家提供其他几种融合方式：https://blog.csdn.net/2201_75517551/article/details/159799348?spm=1001.2014.3001.5502

class MFusion_1(nn.Module):
    def __init__(self, dim_in):
        super(MFusion_1, self).__init__()
        dim = dim_in // 4
        self.dim = dim
        self.num_experts = 4
        self.top_k = 1
        # 单专家版本：仅使用 CVCI 跨模态交互融合
        self.fusion = CVCI(in_chans=dim, embed_dims=dim)

    def forward(self, x):
        return self.fusion(x)


class MFusion_2(nn.Module):
    def __init__(self, dim_in):
        super(MFusion_2, self).__init__()
        dim = dim_in // 4
        self.dim = dim
        self.num_experts = 4
        self.top_k = 1
        # 单专家版本：仅使用掩码引导局部互补融合
        self.fusion = fusion(channel=dim)

    def forward(self, x):
        return self.fusion(x)


class MFusion_3(nn.Module):
    def __init__(self, dim_in):
        super(MFusion_3, self).__init__()
        dim = dim_in // 4
        self.dim = dim
        self.num_experts = 4
        self.top_k = 1
        # 单专家版本：仅使用 GPT 全局令牌融合
        self.fusion = GPT(d_model=dim)

    def forward(self, x):
        return self.fusion(x)

创新三：CFI-MPD-IoU 边界框回归损失。该创新点提出聚焦目标核心区域的鲁棒边界框优化损失：针对恶劣环境下目标边界模糊导致传统 IoU 类损失梯度不稳定、定位精度下降的问题，通过收缩边界计算核心区域重叠度，弱化模糊边界的干扰；同时引入 MPD 角点距离约束，强化预测框与真实框的几何对齐一致性；以闭合可微分形式完成计算，在不增加推理开销的前提下提升边界框优化的稳定性与一致性，有效改善退化场景下目标定位不准、训练收敛震荡的问题，为检测模型提供更可靠的定位监督信号。后续将进行更新以及进行二次创新（发顶刊必备），敬请关注。笔者整理双模态检测的专属论文资料，免费分享给粉丝，需要关注后领取。