# VoVNet backbone (OSA + eSE), adapted from vovnet-detectron2 (no detectron2 dependency).
# Reference: BK2/archive/vovnet-detectron2-master/vovnet/vovnet.py
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F

# (per-stage 3x3 conv width, per-stage output width) for the two width families.
_SLIM_WIDTHS = ((64, 80, 96, 112), (112, 256, 384, 512))
_FULL_WIDTHS = ((128, 160, 192, 224), (256, 512, 768, 1024))


def _make_spec(stem, widths, layer_per_block, block_per_stage, dw):
    """Assemble one VoVNet variant description.

    Every sequence is copied into a fresh list so that no two variant
    dictionaries share a mutable object.
    """
    conv_widths, out_widths = widths
    return {
        'stem': list(stem),
        'stage_conv_ch': list(conv_widths),
        'stage_out_ch': list(out_widths),
        'layer_per_block': layer_per_block,
        'block_per_stage': list(block_per_stage),
        'eSE': True,
        'dw': dw,
    }


# Depthwise-separable variants (stem stays 64 channels wide throughout).
VoVNet19_slim_dw_eSE = _make_spec((64, 64, 64), _SLIM_WIDTHS, 3, (1, 1, 1, 1), True)
VoVNet19_dw_eSE = _make_spec((64, 64, 64), _FULL_WIDTHS, 3, (1, 1, 1, 1), True)

# Plain-convolution variants.
VoVNet19_slim_eSE = _make_spec((64, 64, 128), _SLIM_WIDTHS, 3, (1, 1, 1, 1), False)
VoVNet19_eSE = _make_spec((64, 64, 128), _FULL_WIDTHS, 3, (1, 1, 1, 1), False)
VoVNet39_eSE = _make_spec((64, 64, 128), _FULL_WIDTHS, 5, (1, 1, 2, 2), False)
VoVNet57_eSE = _make_spec((64, 64, 128), _FULL_WIDTHS, 5, (1, 1, 4, 3), False)
VoVNet99_eSE = _make_spec((64, 64, 128), _FULL_WIDTHS, 5, (1, 3, 9, 3), False)

# Canonical variant name -> variant description.
STAGE_SPECS = {
    'V-19-slim-dw-eSE': VoVNet19_slim_dw_eSE,
    'V-19-dw-eSE': VoVNet19_dw_eSE,
    'V-19-slim-eSE': VoVNet19_slim_eSE,
    'V-19-eSE': VoVNet19_eSE,
    'V-39-eSE': VoVNet39_eSE,
    'V-57-eSE': VoVNet57_eSE,
    'V-99-eSE': VoVNet99_eSE,
}

# Short names used in UFLD configs (backbone='vov19slim', ...)
VOVNET_ALIASES = {
    'vov19slim_dw': 'V-19-slim-dw-eSE',
    'vov19_dw': 'V-19-dw-eSE',
    'vov19slim': 'V-19-slim-eSE',
    'vov19': 'V-19-eSE',
    'vov39': 'V-39-eSE',
    'vov57': 'V-57-eSE',
    'vov99': 'V-99-eSE',
}


def _bn(ch):
    """Return the batch-norm layer placed after every conv in this file."""
    return nn.BatchNorm2d(ch)


def dw_conv3x3(in_channels, out_channels, module_name, postfix,
               stride=1, kernel_size=3, padding=1):
    """Build a depthwise-separable conv -> BN -> ReLU as named layers.

    Returns a list of ``(name, module)`` pairs suitable for
    ``nn.Sequential(OrderedDict(...))``; names are
    ``{module_name}_{postfix}/{dw,pw,bn,relu}``.

    The depthwise conv keeps the channel count (``in -> in``, one filter per
    input channel) and the 1x1 pointwise conv performs the channel change
    (``in -> out``). When ``in_channels == out_channels`` the parameter
    shapes are the same as with a depthwise conv declared ``in -> out``, so
    checkpoints saved for equal-width layers stay loadable.
    """
    prefix = f'{module_name}_{postfix}'
    depthwise = nn.Conv2d(
        in_channels, in_channels,
        kernel_size=kernel_size, stride=stride, padding=padding,
        groups=in_channels, bias=False)
    pointwise = nn.Conv2d(
        in_channels, out_channels,
        kernel_size=1, stride=1, padding=0, bias=False)
    return [
        (f'{prefix}/dw', depthwise),
        (f'{prefix}/pw', pointwise),
        (f'{prefix}/bn', _bn(out_channels)),
        (f'{prefix}/relu', nn.ReLU(inplace=True)),
    ]


def _conv_bn_relu(in_channels, out_channels, module_name, postfix, *,
                  stride, groups, kernel_size, padding):
    """Shared builder behind ``conv3x3`` / ``conv1x1``: conv -> BN -> ReLU."""
    prefix = f'{module_name}_{postfix}'
    conv = nn.Conv2d(
        in_channels, out_channels,
        kernel_size=kernel_size, stride=stride, padding=padding,
        groups=groups, bias=False)
    return [
        (f'{prefix}/conv', conv),
        (f'{prefix}/bn', _bn(out_channels)),
        (f'{prefix}/relu', nn.ReLU(inplace=True)),
    ]


def conv3x3(in_channels, out_channels, module_name, postfix,
            stride=1, groups=1, kernel_size=3, padding=1):
    """Return named ``(conv, bn, relu)`` layers; defaults give a padded 3x3 conv."""
    return _conv_bn_relu(
        in_channels, out_channels, module_name, postfix,
        stride=stride, groups=groups, kernel_size=kernel_size, padding=padding)


def conv1x1(in_channels, out_channels, module_name, postfix,
            stride=1, groups=1, kernel_size=1, padding=0):
    """Return named ``(conv, bn, relu)`` layers; defaults give a 1x1 conv."""
    return _conv_bn_relu(
        in_channels, out_channels, module_name, postfix,
        stride=stride, groups=groups, kernel_size=kernel_size, padding=padding)


class Hsigmoid(nn.Module):
    """Hard sigmoid: ``relu6(x + 3) / 6``."""

    def __init__(self, inplace=True):
        super().__init__()
        self.inplace = inplace

    def forward(self, x):
        # ``x + 3.0`` is a fresh tensor, so the in-place relu6 never touches
        # the caller's ``x``.
        return F.relu6(x + 3.0, inplace=self.inplace) / 6.0


class eSEModule(nn.Module):
    """Channel gate: global average pool -> 1x1 conv -> hard sigmoid -> scale."""

    def __init__(self, channel):
        super().__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0)
        self.hsigmoid = Hsigmoid()

    def forward(self, x):
        gate = self.hsigmoid(self.fc(self.avg_pool(x)))
        return x * gate


class _OSA_module(nn.Module):
    """One-shot-aggregation block followed by an eSE gate.

    ``layer_per_block`` convs are chained; the block input and every
    intermediate output are concatenated once, projected to ``concat_ch``
    with a 1x1 conv, gated by ``eSEModule`` and, if ``identity`` is set,
    added to the block input.

    Args:
        in_ch: channels of the incoming feature map.
        stage_ch: channels produced by each chained conv.
        concat_ch: channels of the block output.
        layer_per_block: number of chained convs.
        module_name: prefix used for all sub-layer names.
        identity: add the block input to the output (the caller must make
            sure ``in_ch == concat_ch`` in that case).
        depthwise: use depthwise-separable convs for the chain.
    """

    def __init__(self, in_ch, stage_ch, concat_ch, layer_per_block,
                 module_name, identity=False, depthwise=False):
        super().__init__()
        self.identity = identity
        self.depthwise = depthwise
        self.is_reduced = False
        self.layers = nn.ModuleList()

        in_channel = in_ch
        if self.depthwise and in_channel != stage_ch:
            # The depthwise chain runs at a constant width of ``stage_ch``,
            # so the input is first projected to that width.
            self.is_reduced = True
            self.conv_reduction = nn.Sequential(OrderedDict(
                conv1x1(in_channel, stage_ch, f'{module_name}_reduction', '0')))

        for i in range(layer_per_block):
            if self.depthwise:
                layer = dw_conv3x3(stage_ch, stage_ch, module_name, str(i))
            else:
                layer = conv3x3(in_channel, stage_ch, module_name, str(i))
            self.layers.append(nn.Sequential(OrderedDict(layer)))
            in_channel = stage_ch

        # The aggregation sees the un-reduced input plus every layer output.
        in_channel = in_ch + layer_per_block * stage_ch
        self.concat = nn.Sequential(OrderedDict(
            conv1x1(in_channel, concat_ch, module_name, 'concat')))
        self.ese = eSEModule(concat_ch)

    def forward(self, x):
        identity_feat = x
        output = [x]
        if self.depthwise and self.is_reduced:
            x = self.conv_reduction(x)
        for layer in self.layers:
            x = layer(x)
            output.append(x)

        x = torch.cat(output, dim=1)
        xt = self.ese(self.concat(x))
        if self.identity:
            xt = xt + identity_feat
        return xt


class _OSA_stage(nn.Sequential):
    """A stage: optional stride-2 max-pool followed by ``block_per_stage`` OSA blocks.

    Every stage except stage 2 starts with a 3x3 stride-2 max-pool
    (``ceil_mode=True``). The first block maps ``in_ch -> concat_ch``;
    the remaining blocks map ``concat_ch -> concat_ch`` with an identity
    shortcut.
    """

    def __init__(self, in_ch, stage_ch, concat_ch, block_per_stage,
                 layer_per_block, stage_num, depthwise=False):
        super().__init__()

        if stage_num != 2:
            self.add_module(
                'Pooling',
                nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True))

        module_name = f'OSA{stage_num}_1'
        self.add_module(module_name, _OSA_module(
            in_ch, stage_ch, concat_ch, layer_per_block, module_name,
            depthwise=depthwise))

        for i in range(block_per_stage - 1):
            module_name = f'OSA{stage_num}_{i + 2}'
            self.add_module(module_name, _OSA_module(
                concat_ch, stage_ch, concat_ch, layer_per_block, module_name,
                identity=True, depthwise=depthwise))


class VoVNetBody(nn.Module):
    """VoVNet stages; forward returns (stage3, stage4, stage5) at strides 8/16/32.

    Args:
        variant: a key of ``STAGE_SPECS`` or a short name from
            ``VOVNET_ALIASES``.
        input_ch: number of channels of the input image.

    Raises:
        KeyError: if ``variant`` is neither an alias nor a known spec name.

    Attributes:
        variant: the resolved (canonical) variant name.
        out_channels: dict with the channel counts of the three returned
            feature maps under the keys ``'c3'``, ``'c4'``, ``'c5'``.
    """

    def __init__(self, variant='V-19-slim-eSE', input_ch=3):
        super().__init__()
        if variant in VOVNET_ALIASES:
            variant = VOVNET_ALIASES[variant]
        if variant not in STAGE_SPECS:
            raise KeyError(f'Unknown VoVNet variant {variant!r}, choose from {list(STAGE_SPECS)}')
        self.variant = variant
        spec = STAGE_SPECS[variant]

        stem_ch = spec['stem']
        config_stage_ch = spec['stage_conv_ch']
        config_concat_ch = spec['stage_out_ch']
        block_per_stage = spec['block_per_stage']
        layer_per_block = spec['layer_per_block']
        depthwise = spec['dw']

        # The first stem conv is always a plain conv; only the 2nd and 3rd
        # follow the variant's depthwise setting.
        conv_type = dw_conv3x3 if depthwise else conv3x3
        stem = conv3x3(input_ch, stem_ch[0], 'stem', '1', stride=2)
        stem += conv_type(stem_ch[0], stem_ch[1], 'stem', '2', stride=1)
        stem += conv_type(stem_ch[1], stem_ch[2], 'stem', '3', stride=2)
        self.stem = nn.Sequential(OrderedDict(stem))

        # Each stage consumes the previous stage's output width.
        in_ch_list = [stem_ch[2]] + config_concat_ch[:-1]
        self.stage2 = _OSA_stage(
            in_ch_list[0], config_stage_ch[0], config_concat_ch[0],
            block_per_stage[0], layer_per_block, 2, depthwise=depthwise)
        self.stage3 = _OSA_stage(
            in_ch_list[1], config_stage_ch[1], config_concat_ch[1],
            block_per_stage[1], layer_per_block, 3, depthwise=depthwise)
        self.stage4 = _OSA_stage(
            in_ch_list[2], config_stage_ch[2], config_concat_ch[2],
            block_per_stage[2], layer_per_block, 4, depthwise=depthwise)
        self.stage5 = _OSA_stage(
            in_ch_list[3], config_stage_ch[3], config_concat_ch[3],
            block_per_stage[3], layer_per_block, 5, depthwise=depthwise)

        self.out_channels = {
            'c3': config_concat_ch[1],
            'c4': config_concat_ch[2],
            'c5': config_concat_ch[3],
        }
        self._initialize_weights()

    def _initialize_weights(self):
        """Kaiming-normal init for every conv weight (biases keep their defaults)."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)

    def forward(self, x):
        x = self.stem(x)
        x = self.stage2(x)
        c3 = self.stage3(x)
        c4 = self.stage4(c3)
        c5 = self.stage5(c4)
        return c3, c4, c5


class vovnet(nn.Module):
    """UFLD-compatible wrapper (same interface as resnet: x2, x3, x4).

    Args:
        variant: short alias or canonical spec name; resolved by
            ``VoVNetBody``.
        pretrained: accepted for interface compatibility only; no weights
            are loaded and a ``UserWarning`` is emitted when it is truthy.
    """

    def __init__(self, variant='vov19slim', pretrained=False):
        super().__init__()
        if pretrained:
            import warnings
            warnings.warn(
                'VoVNet has no torchvision pretrained weights in UFLD; '
                'train from scratch or load a custom checkpoint.',
                UserWarning,
                stacklevel=2,
            )
        # VoVNetBody resolves aliases itself, so the name is passed through.
        self.body = VoVNetBody(variant)
        self.variant = self.body.variant

    def forward(self, x):
        return self.body(x)