From 0568102788f8bd67a3ea54f450e8589b7bbebf6a Mon Sep 17 00:00:00 2001 From: zakajd Date: Thu, 4 Jun 2020 20:17:57 +0300 Subject: [PATCH] formatting --- .../detection_models/efficientdet.py | 8 +- pytorch_tools/detection_models/retinanet.py | 13 +- pytorch_tools/models/__init__.py | 2 +- pytorch_tools/models/bit_resnet.py | 455 +++++++++--------- pytorch_tools/models/efficientnet.py | 11 +- pytorch_tools/models/hrnet.py | 136 +++--- pytorch_tools/models/resnet.py | 7 +- pytorch_tools/models/tresnet.py | 37 +- pytorch_tools/models/vgg.py | 1 - pytorch_tools/modules/activated_batch_norm.py | 2 +- pytorch_tools/modules/tf_same_ops.py | 12 +- pytorch_tools/utils/box.py | 117 ++--- pytorch_tools/utils/misc.py | 9 +- tests/detection_models/test_det_models.py | 3 +- tests/losses/test_losses.py | 11 +- tests/models/test_models.py | 21 +- tests/models/test_weights.py | 2 + tests/modules/test_modules.py | 2 + tests/segmentation_models/test_segm_models.py | 11 +- tests/utils/test_utils.py | 24 +- 20 files changed, 473 insertions(+), 411 deletions(-) diff --git a/pytorch_tools/detection_models/efficientdet.py b/pytorch_tools/detection_models/efficientdet.py index e765f97..0446b56 100644 --- a/pytorch_tools/detection_models/efficientdet.py +++ b/pytorch_tools/detection_models/efficientdet.py @@ -161,16 +161,14 @@ def predict(self, x): """ class_outputs, box_outputs = self.forward(x) anchors = box_utils.generate_anchors_boxes(x.shape[-2:])[0] - return box_utils.decode( - class_outputs, box_outputs, anchors, #img_shape=x.shape[-2:] - ) + return box_utils.decode(class_outputs, box_outputs, anchors) def _initialize_weights(self): # init everything except encoder no_encoder_m = [m for n, m in self.named_modules() if not "encoder" in n] initialize_iterator(no_encoder_m) - # need to init last bias so that after sigmoid it's 0.01 - cls_bias_init = -torch.log(torch.tensor((1 - 0.01) / 0.01)) # -4.59 + # need to init last bias so that after sigmoid it's 0.01 + cls_bias_init = -torch.log(torch.tensor((1 - 0.01) / 0.01)) # -4.59 nn.init.constant_(self.cls_head_convs[-1][1].bias, cls_bias_init) diff --git a/pytorch_tools/detection_models/retinanet.py b/pytorch_tools/detection_models/retinanet.py index 0ddf559..2fbdf74 100644 --- a/pytorch_tools/detection_models/retinanet.py +++ b/pytorch_tools/detection_models/retinanet.py @@ -44,7 +44,7 @@ class RetinaNet(nn.Module): def __init__( self, - pretrained="coco", # not used here for proper signature + pretrained="coco", # not used here for proper signature encoder_name="resnet50", encoder_weights="imagenet", pyramid_channels=256, @@ -90,7 +90,7 @@ def make_final_convs(): self.box_convs = make_final_convs() self.box_head_conv = conv3x3(pyramid_channels, 4 * anchors_per_location, bias=True) self.num_classes = num_classes - self. 
_initialize_weights() + self._initialize_weights() # Name from mmdetectin for convenience def extract_features(self, x): @@ -126,18 +126,17 @@ def predict(self, x): """Run forward on given images and decode raw prediction into bboxes""" class_outputs, box_outputs = self.forward(x) anchors = box_utils.generate_anchors_boxes(x.shape[-2:])[0] - return box_utils.decode( - class_outputs, box_outputs, anchors, img_shape=x.shape[-2:] - ) + return box_utils.decode(class_outputs, box_outputs, anchors) def _initialize_weights(self): # init everything except encoder no_encoder_m = [m for n, m in self.named_modules() if not "encoder" in n] initialize_iterator(no_encoder_m) - # need to init last bias so that after sigmoid it's 0.01 - cls_bias_init = -torch.log(torch.tensor((1 - 0.01) / 0.01)) # -4.59 + # need to init last bias so that after sigmoid it's 0.01 + cls_bias_init = -torch.log(torch.tensor((1 - 0.01) / 0.01)) # -4.59 nn.init.constant_(self.cls_head_conv.bias, cls_bias_init) + # Don't really know input size for the models. 512 is just a guess PRETRAIN_SETTINGS = {**DEFAULT_IMAGENET_SETTINGS, "input_size": (512, 512), "crop_pct": 1, "num_classes": 80} diff --git a/pytorch_tools/models/__init__.py b/pytorch_tools/models/__init__.py index ef9c0d4..4fb855e 100644 --- a/pytorch_tools/models/__init__.py +++ b/pytorch_tools/models/__init__.py @@ -51,4 +51,4 @@ from .bit_resnet import bit_m_101x1 from .bit_resnet import bit_m_101x3 from .bit_resnet import bit_m_152x2 -from .bit_resnet import bit_m_152x4 \ No newline at end of file +from .bit_resnet import bit_m_152x4 diff --git a/pytorch_tools/models/bit_resnet.py b/pytorch_tools/models/bit_resnet.py index 3108e97..c5acbd5 100644 --- a/pytorch_tools/models/bit_resnet.py +++ b/pytorch_tools/models/bit_resnet.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -29,179 +29,181 @@ def conv3x3(cin, cout, stride=1, groups=1, bias=False): - return StdConv2d(cin, cout, kernel_size=3, stride=stride, - padding=1, bias=bias, groups=groups) + return StdConv2d(cin, cout, kernel_size=3, stride=stride, padding=1, bias=bias, groups=groups) def conv1x1(cin, cout, stride=1, bias=False): - return StdConv2d(cin, cout, kernel_size=1, stride=stride, - padding=0, bias=bias) + return StdConv2d(cin, cout, kernel_size=1, stride=stride, padding=0, bias=bias) def tf2th(conv_weights): - """Possibly convert HWIO to OIHW.""" - if conv_weights.ndim == 4: - conv_weights = conv_weights.transpose([3, 2, 0, 1]) - return torch.from_numpy(conv_weights) + """Possibly convert HWIO to OIHW.""" + if conv_weights.ndim == 4: + conv_weights = conv_weights.transpose([3, 2, 0, 1]) + return torch.from_numpy(conv_weights) class PreActBottleneck(nn.Module): - """Pre-activation (v2) bottleneck block. - - Follows the implementation of "Identity Mappings in Deep Residual Networks": - https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua - - Except it puts the stride on 3x3 conv when available. 
- """ - - def __init__(self, cin, cout=None, cmid=None, stride=1): - super().__init__() - cout = cout or cin - cmid = cmid or cout//4 - - self.gn1 = nn.GroupNorm(32, cin) - self.conv1 = conv1x1(cin, cmid) - self.gn2 = nn.GroupNorm(32, cmid) - self.conv2 = conv3x3(cmid, cmid, stride) # Original code has it on conv1!! - self.gn3 = nn.GroupNorm(32, cmid) - self.conv3 = conv1x1(cmid, cout) - self.relu = nn.ReLU(inplace=True) - - if (stride != 1 or cin != cout): - # Projection also with pre-activation according to paper. - self.downsample = conv1x1(cin, cout, stride) - - def forward(self, x): - out = self.relu(self.gn1(x)) - - # Residual branch - residual = x - if hasattr(self, 'downsample'): - residual = self.downsample(out) - - # Unit's branch - out = self.conv1(out) - out = self.conv2(self.relu(self.gn2(out))) - out = self.conv3(self.relu(self.gn3(out))) - - return out + residual - - def load_from(self, weights, prefix=''): - convname = 'standardized_conv2d' - with torch.no_grad(): - self.conv1.weight.copy_(tf2th(weights[f'{prefix}a/{convname}/kernel'])) - self.conv2.weight.copy_(tf2th(weights[f'{prefix}b/{convname}/kernel'])) - self.conv3.weight.copy_(tf2th(weights[f'{prefix}c/{convname}/kernel'])) - self.gn1.weight.copy_(tf2th(weights[f'{prefix}a/group_norm/gamma'])) - self.gn2.weight.copy_(tf2th(weights[f'{prefix}b/group_norm/gamma'])) - self.gn3.weight.copy_(tf2th(weights[f'{prefix}c/group_norm/gamma'])) - self.gn1.bias.copy_(tf2th(weights[f'{prefix}a/group_norm/beta'])) - self.gn2.bias.copy_(tf2th(weights[f'{prefix}b/group_norm/beta'])) - self.gn3.bias.copy_(tf2th(weights[f'{prefix}c/group_norm/beta'])) - if hasattr(self, 'downsample'): - w = weights[f'{prefix}a/proj/{convname}/kernel'] - self.downsample.weight.copy_(tf2th(w)) + """Pre-activation (v2) bottleneck block. + + Follows the implementation of "Identity Mappings in Deep Residual Networks": + https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua + + Except it puts the stride on 3x3 conv when available. + """ + + def __init__(self, cin, cout=None, cmid=None, stride=1): + super().__init__() + cout = cout or cin + cmid = cmid or cout // 4 + + self.gn1 = nn.GroupNorm(32, cin) + self.conv1 = conv1x1(cin, cmid) + self.gn2 = nn.GroupNorm(32, cmid) + self.conv2 = conv3x3(cmid, cmid, stride) # Original code has it on conv1!! + self.gn3 = nn.GroupNorm(32, cmid) + self.conv3 = conv1x1(cmid, cout) + self.relu = nn.ReLU(inplace=True) + + if stride != 1 or cin != cout: + # Projection also with pre-activation according to paper. 
+ self.downsample = conv1x1(cin, cout, stride) + + def forward(self, x): + out = self.relu(self.gn1(x)) + + # Residual branch + residual = x + if hasattr(self, "downsample"): + residual = self.downsample(out) + + # Unit's branch + out = self.conv1(out) + out = self.conv2(self.relu(self.gn2(out))) + out = self.conv3(self.relu(self.gn3(out))) + + return out + residual + + def load_from(self, weights, prefix=""): + convname = "standardized_conv2d" + with torch.no_grad(): + self.conv1.weight.copy_(tf2th(weights[f"{prefix}a/{convname}/kernel"])) + self.conv2.weight.copy_(tf2th(weights[f"{prefix}b/{convname}/kernel"])) + self.conv3.weight.copy_(tf2th(weights[f"{prefix}c/{convname}/kernel"])) + self.gn1.weight.copy_(tf2th(weights[f"{prefix}a/group_norm/gamma"])) + self.gn2.weight.copy_(tf2th(weights[f"{prefix}b/group_norm/gamma"])) + self.gn3.weight.copy_(tf2th(weights[f"{prefix}c/group_norm/gamma"])) + self.gn1.bias.copy_(tf2th(weights[f"{prefix}a/group_norm/beta"])) + self.gn2.bias.copy_(tf2th(weights[f"{prefix}b/group_norm/beta"])) + self.gn3.bias.copy_(tf2th(weights[f"{prefix}c/group_norm/beta"])) + if hasattr(self, "downsample"): + w = weights[f"{prefix}a/proj/{convname}/kernel"] + self.downsample.weight.copy_(tf2th(w)) + # this models are designed for trasfer learning only! not for training from scratch class ResNetV2(nn.Module): - """ - Implementation of Pre-activation (v2) ResNet mode. - Used to create Bit-M-50/101/152x1/2/3/4 models - - Args: - num_classes (int): Number of classification classes. Defaults to 5 - """ - - def __init__( - self, - block_units, - width_factor, - # in_channels=3, # TODO: add later - num_classes=5, # just a random number - # encoder=False, # TODO: add later + """ + Implementation of Pre-activation (v2) ResNet mode. + Used to create Bit-M-50/101/152x1/2/3/4 models + + Args: + num_classes (int): Number of classification classes. Defaults to 5 + """ + + def __init__( + self, + block_units, + width_factor, + # in_channels=3, # TODO: add later + num_classes=5, # just a random number + # encoder=False, # TODO: add later ): - super().__init__() - wf = width_factor # shortcut 'cause we'll use it a lot. - - # The following will be unreadable if we split lines. - # pylint: disable=line-too-long - self.root = nn.Sequential(OrderedDict([ - ('conv', StdConv2d(3, 64*wf, kernel_size=7, stride=2, padding=3, bias=False)), - ('pad', nn.ConstantPad2d(1, 0)), - ('pool', nn.MaxPool2d(kernel_size=3, stride=2, padding=0)), - # The following is subtly not the same! 
- # ('pool', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)), - ])) - - self.body = nn.Sequential(OrderedDict([ - ('block1', nn.Sequential(OrderedDict( - [('unit01', PreActBottleneck(cin=64*wf, cout=256*wf, cmid=64*wf))] + - [(f'unit{i:02d}', PreActBottleneck(cin=256*wf, cout=256*wf, cmid=64*wf)) for i in range(2, block_units[0] + 1)], - ))), - ('block2', nn.Sequential(OrderedDict( - [('unit01', PreActBottleneck(cin=256*wf, cout=512*wf, cmid=128*wf, stride=2))] + - [(f'unit{i:02d}', PreActBottleneck(cin=512*wf, cout=512*wf, cmid=128*wf)) for i in range(2, block_units[1] + 1)], - ))), - ('block3', nn.Sequential(OrderedDict( - [('unit01', PreActBottleneck(cin=512*wf, cout=1024*wf, cmid=256*wf, stride=2))] + - [(f'unit{i:02d}', PreActBottleneck(cin=1024*wf, cout=1024*wf, cmid=256*wf)) for i in range(2, block_units[2] + 1)], - ))), - ('block4', nn.Sequential(OrderedDict( - [('unit01', PreActBottleneck(cin=1024*wf, cout=2048*wf, cmid=512*wf, stride=2))] + - [(f'unit{i:02d}', PreActBottleneck(cin=2048*wf, cout=2048*wf, cmid=512*wf)) for i in range(2, block_units[3] + 1)], - ))), - ])) - # pylint: enable=line-too-long - - self.head = nn.Sequential(OrderedDict([ - ('gn', nn.GroupNorm(32, 2048*wf)), - ('relu', nn.ReLU(inplace=True)), - ('avg', nn.AdaptiveAvgPool2d(output_size=1)), - ('conv', nn.Conv2d(2048*wf, num_classes, kernel_size=1, bias=True)), - ])) - - def features(self, x): - return self.body(self.root(x)) - - def logits(self, x): - return self.head(x) - - def forward(self, x): - x = self.logits(self.features(x)) - assert x.shape[-2:] == (1, 1) # We should have no spatial shape left. - return x[...,0,0] - - def load_from(self, weights, prefix='resnet/'): - with torch.no_grad(): - self.root.conv.weight.copy_(tf2th(weights[f'{prefix}root_block/standardized_conv2d/kernel'])) # pylint: disable=line-too-long - self.head.gn.weight.copy_(tf2th(weights[f'{prefix}group_norm/gamma'])) - self.head.gn.bias.copy_(tf2th(weights[f'{prefix}group_norm/beta'])) - # always zero_head - nn.init.zeros_(self.head.conv.weight) - nn.init.zeros_(self.head.conv.bias) - - for bname, block in self.body.named_children(): - for uname, unit in block.named_children(): - unit.load_from(weights, prefix=f'{prefix}{bname}/{uname}/') - - - - -KNOWN_MODELS = OrderedDict([ - ('BiT-M-R50x1', lambda *a, **kw: ResNetV2([3, 4, 6, 3], 1, *a, **kw)), - ('BiT-M-R50x3', lambda *a, **kw: ResNetV2([3, 4, 6, 3], 3, *a, **kw)), - ('BiT-M-R101x1', lambda *a, **kw: ResNetV2([3, 4, 23, 3], 1, *a, **kw)), - ('BiT-M-R101x3', lambda *a, **kw: ResNetV2([3, 4, 23, 3], 3, *a, **kw)), - ('BiT-M-R152x2', lambda *a, **kw: ResNetV2([3, 8, 36, 3], 2, *a, **kw)), - ('BiT-M-R152x4', lambda *a, **kw: ResNetV2([3, 8, 36, 3], 4, *a, **kw)), - - ('BiT-S-R50x1', lambda *a, **kw: ResNetV2([3, 4, 6, 3], 1, *a, **kw)), - ('BiT-S-R50x3', lambda *a, **kw: ResNetV2([3, 4, 6, 3], 3, *a, **kw)), - ('BiT-S-R101x1', lambda *a, **kw: ResNetV2([3, 4, 23, 3], 1, *a, **kw)), - ('BiT-S-R101x3', lambda *a, **kw: ResNetV2([3, 4, 23, 3], 3, *a, **kw)), - ('BiT-S-R152x2', lambda *a, **kw: ResNetV2([3, 8, 36, 3], 2, *a, **kw)), - ('BiT-S-R152x4', lambda *a, **kw: ResNetV2([3, 8, 36, 3], 4, *a, **kw)), -]) + super().__init__() + wf = width_factor # shortcut 'cause we'll use it a lot. + + # The following will be unreadable if we split lines. 
+ # pylint: disable=line-too-long + # fmt: off + self.root = nn.Sequential(OrderedDict([ + ('conv', StdConv2d(3, 64*wf, kernel_size=7, stride=2, padding=3, bias=False)), + ('pad', nn.ConstantPad2d(1, 0)), + ('pool', nn.MaxPool2d(kernel_size=3, stride=2, padding=0)), + # The following is subtly not the same! + # ('pool', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)), + ])) + + self.body = nn.Sequential(OrderedDict([ + ('block1', nn.Sequential(OrderedDict( + [('unit01', PreActBottleneck(cin=64*wf, cout=256*wf, cmid=64*wf))] + + [(f'unit{i:02d}', PreActBottleneck(cin=256*wf, cout=256*wf, cmid=64*wf)) for i in range(2, block_units[0] + 1)], + ))), + ('block2', nn.Sequential(OrderedDict( + [('unit01', PreActBottleneck(cin=256*wf, cout=512*wf, cmid=128*wf, stride=2))] + + [(f'unit{i:02d}', PreActBottleneck(cin=512*wf, cout=512*wf, cmid=128*wf)) for i in range(2, block_units[1] + 1)], + ))), + ('block3', nn.Sequential(OrderedDict( + [('unit01', PreActBottleneck(cin=512*wf, cout=1024*wf, cmid=256*wf, stride=2))] + + [(f'unit{i:02d}', PreActBottleneck(cin=1024*wf, cout=1024*wf, cmid=256*wf)) for i in range(2, block_units[2] + 1)], + ))), + ('block4', nn.Sequential(OrderedDict( + [('unit01', PreActBottleneck(cin=1024*wf, cout=2048*wf, cmid=512*wf, stride=2))] + + [(f'unit{i:02d}', PreActBottleneck(cin=2048*wf, cout=2048*wf, cmid=512*wf)) for i in range(2, block_units[3] + 1)], + ))), + ])) + # pylint: enable=line-too-long + + self.head = nn.Sequential(OrderedDict([ + ('gn', nn.GroupNorm(32, 2048*wf)), + ('relu', nn.ReLU(inplace=True)), + ('avg', nn.AdaptiveAvgPool2d(output_size=1)), + ('conv', nn.Conv2d(2048*wf, num_classes, kernel_size=1, bias=True)), + ])) + # fmt: on + + def features(self, x): + return self.body(self.root(x)) + + def logits(self, x): + return self.head(x) + + def forward(self, x): + x = self.logits(self.features(x)) + assert x.shape[-2:] == (1, 1) # We should have no spatial shape left. 
+ return x[..., 0, 0] + + def load_from(self, weights, prefix="resnet/"): + with torch.no_grad(): + self.root.conv.weight.copy_( + tf2th(weights[f"{prefix}root_block/standardized_conv2d/kernel"]) + ) # pylint: disable=line-too-long + self.head.gn.weight.copy_(tf2th(weights[f"{prefix}group_norm/gamma"])) + self.head.gn.bias.copy_(tf2th(weights[f"{prefix}group_norm/beta"])) + # always zero_head + nn.init.zeros_(self.head.conv.weight) + nn.init.zeros_(self.head.conv.bias) + + for bname, block in self.body.named_children(): + for uname, unit in block.named_children(): + unit.load_from(weights, prefix=f"{prefix}{bname}/{uname}/") + + +KNOWN_MODELS = OrderedDict( + [ + ("BiT-M-R50x1", lambda *a, **kw: ResNetV2([3, 4, 6, 3], 1, *a, **kw)), + ("BiT-M-R50x3", lambda *a, **kw: ResNetV2([3, 4, 6, 3], 3, *a, **kw)), + ("BiT-M-R101x1", lambda *a, **kw: ResNetV2([3, 4, 23, 3], 1, *a, **kw)), + ("BiT-M-R101x3", lambda *a, **kw: ResNetV2([3, 4, 23, 3], 3, *a, **kw)), + ("BiT-M-R152x2", lambda *a, **kw: ResNetV2([3, 8, 36, 3], 2, *a, **kw)), + ("BiT-M-R152x4", lambda *a, **kw: ResNetV2([3, 8, 36, 3], 4, *a, **kw)), + ("BiT-S-R50x1", lambda *a, **kw: ResNetV2([3, 4, 6, 3], 1, *a, **kw)), + ("BiT-S-R50x3", lambda *a, **kw: ResNetV2([3, 4, 6, 3], 3, *a, **kw)), + ("BiT-S-R101x1", lambda *a, **kw: ResNetV2([3, 4, 23, 3], 1, *a, **kw)), + ("BiT-S-R101x3", lambda *a, **kw: ResNetV2([3, 4, 23, 3], 3, *a, **kw)), + ("BiT-S-R152x2", lambda *a, **kw: ResNetV2([3, 8, 36, 3], 2, *a, **kw)), + ("BiT-S-R152x4", lambda *a, **kw: ResNetV2([3, 8, 36, 3], 4, *a, **kw)), + ] +) PRETRAIN_SETTINGS = { @@ -215,96 +217,99 @@ def load_from(self, weights, prefix='resnet/'): # fmt: off CFGS = { - # weights are loaded by default - "bit_m_50x1": { - "default": { - "params": {"block_units": [3, 4, 6, 3], "width_factor": 1}, - "url": "https://storage.googleapis.com/bit_models/BiT-M-R50x1.npz", - **PRETRAIN_SETTINGS + # weights are loaded by default + "bit_m_50x1": { + "default": { + "params": {"block_units": [3, 4, 6, 3], "width_factor": 1}, + "url": "https://storage.googleapis.com/bit_models/BiT-M-R50x1.npz", + **PRETRAIN_SETTINGS + }, }, - }, - "bit_m_50x3": { - "default": { - "params": {"block_units": [3, 4, 6, 3], "width_factor": 3}, - "url": "https://storage.googleapis.com/bit_models/BiT-M-R50x3.npz", - **PRETRAIN_SETTINGS, + "bit_m_50x3": { + "default": { + "params": {"block_units": [3, 4, 6, 3], "width_factor": 3}, + "url": "https://storage.googleapis.com/bit_models/BiT-M-R50x3.npz", + **PRETRAIN_SETTINGS, + }, }, - }, - "bit_m_101x1": { - "default": { - "params": {"block_units": [3, 4, 23, 3], "width_factor": 1}, - "url": "https://storage.googleapis.com/bit_models/BiT-M-R101x1.npz", - **PRETRAIN_SETTINGS, + "bit_m_101x1": { + "default": { + "params": {"block_units": [3, 4, 23, 3], "width_factor": 1}, + "url": "https://storage.googleapis.com/bit_models/BiT-M-R101x1.npz", + **PRETRAIN_SETTINGS, + }, }, - }, - "bit_m_101x3": { - "default": { - "params": {"block_units": [3, 4, 23, 3], "width_factor": 3}, - "url": "https://storage.googleapis.com/bit_models/BiT-M-R101x3.npz", - **PRETRAIN_SETTINGS, + "bit_m_101x3": { + "default": { + "params": {"block_units": [3, 4, 23, 3], "width_factor": 3}, + "url": "https://storage.googleapis.com/bit_models/BiT-M-R101x3.npz", + **PRETRAIN_SETTINGS, + }, }, - }, - "bit_m_152x2": { - "default": { - "params": {"block_units": [3, 8, 36, 3], "width_factor": 2}, - "url": "https://storage.googleapis.com/bit_models/BiT-M-R152x2.npz", - **PRETRAIN_SETTINGS, + "bit_m_152x2": { + "default": { + "params": 
{"block_units": [3, 8, 36, 3], "width_factor": 2}, + "url": "https://storage.googleapis.com/bit_models/BiT-M-R152x2.npz", + **PRETRAIN_SETTINGS, + }, }, - }, - "bit_m_152x4": { - "default": { - "params": {"block_units": [3, 8, 36, 3], "width_factor": 4}, - "url": "https://storage.googleapis.com/bit_models/BiT-M-R152x4.npz", - **PRETRAIN_SETTINGS + "bit_m_152x4": { + "default": { + "params": {"block_units": [3, 8, 36, 3], "width_factor": 4}, + "url": "https://storage.googleapis.com/bit_models/BiT-M-R152x4.npz", + **PRETRAIN_SETTINGS + }, }, - }, } # fmt: on def _bit_resnet(arch, pretrained=None, **kwargs): - cfgs = deepcopy(CFGS) - cfg_settings = cfgs[arch]["default"] - cfg_params = cfg_settings.pop("params") - cfg_url = cfg_settings.pop("url") - kwargs.pop("pretrained", None) - kwargs.update(cfg_params) - model = ResNetV2(**kwargs) - # load weights to torch checkpoints folder - try: - torch.hub.load_state_dict_from_url(cfg_url) - except RuntimeError: - pass # to avoid RuntimeError: Only one file(not dir) is allowed in the zipfile - filename = os.path.basename(urlparse(cfg_url).path) - torch_home = torch.hub._get_torch_home() - cached_file = os.path.join(torch_home, 'checkpoints', filename) - weights = np.load(cached_file) - model.load_from(weights) - return model + cfgs = deepcopy(CFGS) + cfg_settings = cfgs[arch]["default"] + cfg_params = cfg_settings.pop("params") + cfg_url = cfg_settings.pop("url") + kwargs.pop("pretrained", None) + kwargs.update(cfg_params) + model = ResNetV2(**kwargs) + # load weights to torch checkpoints folder + try: + torch.hub.load_state_dict_from_url(cfg_url) + except RuntimeError: + pass # to avoid RuntimeError: Only one file(not dir) is allowed in the zipfile + filename = os.path.basename(urlparse(cfg_url).path) + torch_home = torch.hub._get_torch_home() + cached_file = os.path.join(torch_home, "checkpoints", filename) + weights = np.load(cached_file) + model.load_from(weights) + return model + # only want M versions of models for fine-tuning @wraps(ResNetV2) def bit_m_50x1(**kwargs): - return _bit_resnet("bit_m_50x1", **kwargs) + return _bit_resnet("bit_m_50x1", **kwargs) + @wraps(ResNetV2) def bit_m_50x3(**kwargs): - return _bit_resnet("bit_m_50x3", **kwargs) + return _bit_resnet("bit_m_50x3", **kwargs) + @wraps(ResNetV2) def bit_m_101x1(**kwargs): - return _bit_resnet("bit_m_101x1", **kwargs) + return _bit_resnet("bit_m_101x1", **kwargs) + @wraps(ResNetV2) def bit_m_101x3(**kwargs): - return _bit_resnet("bit_m_101x3", **kwargs) + return _bit_resnet("bit_m_101x3", **kwargs) + @wraps(ResNetV2) def bit_m_152x2(**kwargs): - return _bit_resnet("bit_m_152x2", **kwargs) + return _bit_resnet("bit_m_152x2", **kwargs) + @wraps(ResNetV2) def bit_m_152x4(**kwargs): - return _bit_resnet("bit_m_152x4", **kwargs) - - - + return _bit_resnet("bit_m_152x4", **kwargs) diff --git a/pytorch_tools/models/efficientnet.py b/pytorch_tools/models/efficientnet.py index ca5e337..17e67c0 100644 --- a/pytorch_tools/models/efficientnet.py +++ b/pytorch_tools/models/efficientnet.py @@ -144,7 +144,7 @@ def __init__( self.dropout = nn.Dropout(drop_rate, inplace=True) self.classifier = nn.Linear(num_features, num_classes) - patch_bn(self) # adjust epsilon + patch_bn(self) # adjust epsilon initialize(self) if match_tf_same_padding: conv_to_same_conv(self) @@ -397,7 +397,8 @@ def patch_bn(module): module.eps = 1e-3 for m in module.children(): patch_bn(m) - + + def _efficientnet(arch, pretrained=None, **kwargs): cfgs = deepcopy(CFGS) cfg_settings = cfgs[arch]["default"] @@ -426,8 +427,10 @@ def 
_efficientnet(arch, pretrained=None, **kwargs): ) state_dict["classifier.weight"] = model.state_dict()["classifier.weight"] state_dict["classifier.bias"] = model.state_dict()["classifier.bias"] - if kwargs.get("in_channels", 3) != 3: # support pretrained for custom input channels - state_dict["conv_stem.weight"] = repeat_channels(state_dict["conv_stem.weight"], kwargs["in_channels"]) + if kwargs.get("in_channels", 3) != 3: # support pretrained for custom input channels + state_dict["conv_stem.weight"] = repeat_channels( + state_dict["conv_stem.weight"], kwargs["in_channels"] + ) model.load_state_dict(state_dict) setattr(model, "pretrained_settings", cfg_settings) return model diff --git a/pytorch_tools/models/hrnet.py b/pytorch_tools/models/hrnet.py index 0a65182..cbf5817 100644 --- a/pytorch_tools/models/hrnet.py +++ b/pytorch_tools/models/hrnet.py @@ -43,22 +43,23 @@ def make_layer(inplanes, planes, blocks, norm_layer=ABN, norm_act="relu"): layers = [] layers.append(block(inplanes, planes, downsample=downsample, **bn_args)) inplanes = planes * block.expansion - for i in range(1, blocks): + for _ in range(1, blocks): layers.append(block(inplanes, planes, **bn_args)) return nn.Sequential(*layers) + class HighResolutionModule(nn.Module): def __init__( - self, - num_branches, # number of parallel branches - num_blocks, # number of blocks + self, + num_branches, # number of parallel branches + num_blocks, # number of blocks num_channels, norm_layer=ABN, norm_act="relu", ): super(HighResolutionModule, self).__init__() self.block = BasicBlock - self.num_branches = num_branches # used in forward + self.num_branches = num_branches # used in forward self.num_inchannels = num_channels self.bn_args = {"norm_layer": norm_layer, "norm_act": norm_act} branches = [self._make_branch(n_bl, n_ch) for n_bl, n_ch in zip(num_blocks, num_channels)] @@ -69,6 +70,7 @@ def __init__( def _make_branch(self, b_blocks, b_channels): return nn.Sequential(*[self.block(b_channels, b_channels, **self.bn_args) for _ in range(b_blocks)]) + # fmt: off # don't want to rewrite this piece it's too fragile def _make_fuse_layers(self, norm_layer, norm_act): if self.num_branches == 1: @@ -104,23 +106,24 @@ def _make_fuse_layers(self, norm_layer, norm_act): fuse_layers.append(nn.ModuleList(fuse_layer)) return nn.ModuleList(fuse_layers) - + # fmt: on def forward(self, x): if self.num_branches == 1: return [self.branches[0](x[0])] - + x = [branch(x_i) for branch, x_i in zip(self.branches, x)] x_fuse = [] for i in range(len(self.fuse_layers)): y = x[0] if i == 0 else self.fuse_layers[i][0](x[0]) for j in range(1, self.num_branches): - y = y + self.fuse_layers[i][j](x[j]) + y = y + self.fuse_layers[i][j](x[j]) x_fuse.append(self.relu(y)) return x_fuse + class TransitionBlock(nn.Module): """Transition is where new branches for smaller resolution are born -- ==> -- @@ -129,7 +132,7 @@ class TransitionBlock(nn.Module): \ \=> -- """ - + def __init__(self, prev_channels, current_channels, norm_layer=ABN, norm_act="relu"): super().__init__() transition_layers = [] @@ -140,40 +143,40 @@ def __init__(self, prev_channels, current_channels, norm_layer=ABN, norm_act="re transition_layers.append(nn.Sequential(*layers)) else: transition_layers.append(nn.Identity()) - - if len(current_channels) > len(prev_channels): # only works for ONE extra branch + + if len(current_channels) > len(prev_channels): # only works for ONE extra branch layers = [ - conv3x3(prev_channels[-1], current_channels[-1], 2), - norm_layer(current_channels[-1], 
activation=norm_act) + conv3x3(prev_channels[-1], current_channels[-1], 2), + norm_layer(current_channels[-1], activation=norm_act), ] transition_layers.append(nn.Sequential(*layers)) self.trans_layers = nn.ModuleList(transition_layers) - - def forward(self, x): # x is actually an array + + def forward(self, x): # x is actually an array out_x = [trans_l(x_i) for x_i, trans_l in zip(x, self.trans_layers)] out_x.append(self.trans_layers[-1](x[-1])) return out_x + class HRClassificationHead(nn.Module): def __init__(self, pre_channels, norm_layer=ABN, norm_act="relu"): super().__init__() head_block = Bottleneck head_channels = [32, 64, 128, 256] - # Increasing the #channels on each resolution + # Increasing the #channels on each resolution # from C, 2C, 4C, 8C to 128, 256, 512, 1024 incre_modules = [] for (pre_c, head_c) in zip(pre_channels, head_channels): incre_modules.append(make_layer(pre_c, head_c, 1, norm_layer, norm_act)) self.incre_modules = nn.ModuleList(incre_modules) - + # downsampling modules downsamp_modules = [] - for i in range(len(pre_channels)-1): + for i in range(len(pre_channels) - 1): in_ch = head_channels[i] * head_block.expansion - out_ch = head_channels[i+1] * head_block.expansion + out_ch = head_channels[i + 1] * head_block.expansion downsamp_module = nn.Sequential( - conv3x3(in_ch, out_ch, 2, bias=True), - norm_layer(out_ch, activation=norm_act) + conv3x3(in_ch, out_ch, 2, bias=True), norm_layer(out_ch, activation=norm_act) ) downsamp_modules.append(downsamp_module) self.downsamp_modules = nn.ModuleList(downsamp_modules) @@ -182,13 +185,13 @@ def __init__(self, pre_channels, norm_layer=ABN, norm_act="relu"): conv1x1(head_channels[3] * head_block.expansion, 2048, bias=True), norm_layer(2048, activation=norm_act), ) - + def forward(self, x): - x = [self.incre_modules[i](x[i]) for i in range(4)] + x = [self.incre_modules[i](x[i]) for i in range(4)] for i in range(1, 4): - x[i] = x[i] + self.downsamp_modules[i-1](x[i-1]) + x[i] = x[i] + self.downsamp_modules[i - 1](x[i - 1]) return self.final_layer(x[3]) - + class HighResolutionNet(nn.Module): """HighResolution Nets constructor @@ -219,13 +222,14 @@ class HighResolutionNet(nn.Module): NOTE: HRNet first features have resolution 4x times smaller than input, not 2x as all other models. So it CAN'T be used as encoder in Unet and Linknet models """ - # drop_rate (float): - # Dropout probability before classifier, for training. Defaults to 0. + + # drop_rate (float): + # Dropout probability before classifier, for training. Defaults to 0. def __init__( - self, + self, width=18, small=False, - pretrained=None, # not used. here for proper signature + pretrained=None, # not used. 
here for proper signature num_classes=1000, in_channels=3, norm_layer="abn", @@ -241,27 +245,25 @@ def __init__( self.conv2 = conv3x3(stem_width, stem_width, stride=2) self.bn2 = norm_layer(stem_width, activation=norm_act) - + channels = [width, width * 2, width * 4, width * 8] n_blocks = [2 if small else 4] * 4 - + self.layer1 = make_layer(stem_width, stem_width, n_blocks[0], **bn_args) - + self.transition1 = TransitionBlock([stem_width * Bottleneck.expansion], channels[:2], **bn_args) - self.stage2 = self._make_stage( - n_modules=1, n_branches=2, n_blocks=n_blocks[:2], n_chnls=channels[:2] - ) - + self.stage2 = self._make_stage(n_modules=1, n_branches=2, n_blocks=n_blocks[:2], n_chnls=channels[:2]) + self.transition2 = TransitionBlock(channels[:2], channels[:3], **bn_args) - self.stage3 = self._make_stage( # 3 if small else 4 - n_modules=(4,3)[small], n_branches=3, n_blocks=n_blocks[:3], n_chnls=channels[:3] + self.stage3 = self._make_stage( # 3 if small else 4 + n_modules=(4, 3)[small], n_branches=3, n_blocks=n_blocks[:3], n_chnls=channels[:3] ) - + self.transition3 = TransitionBlock(channels[:3], channels, **bn_args) - self.stage4 = self._make_stage( # 2 if small else 3 - n_modules=(3,2)[small], n_branches=4, n_blocks=n_blocks, n_chnls=channels, + self.stage4 = self._make_stage( # 2 if small else 3 + n_modules=(3, 2)[small], n_branches=4, n_blocks=n_blocks, n_chnls=channels, ) - + self.encoder = encoder if encoder: self.forward = self.encoder_features @@ -276,16 +278,9 @@ def __init__( def _make_stage(self, n_modules, n_branches, n_blocks, n_chnls): modules = [] for i in range(n_modules): - modules.append( - HighResolutionModule( - n_branches, - n_blocks, - n_chnls, - **self.bn_args, - ) - ) + modules.append(HighResolutionModule(n_branches, n_blocks, n_chnls, **self.bn_args,)) return nn.Sequential(*modules) - + def encoder_features(self, x): # stem x = self.conv1(x) @@ -293,46 +288,46 @@ def encoder_features(self, x): x = self.conv2(x) x = self.bn2(x) x = self.layer1(x) - - x = self.transition1([x]) # x is actually a list now + + x = self.transition1([x]) # x is actually a list now x = self.stage2(x) - + x = self.transition2(x) x = self.stage3(x) - + x = self.transition3(x) x = self.stage4(x) - if self.encoder: # want to return from lowest resolution to highest + if self.encoder: # want to return from lowest resolution to highest x = [x[3], x[2], x[1], x[0], x[0]] return x - + def features(self, x): x = self.encoder_features(x) x = self.cls_head(x) return x - + def logits(self, x): x = self.global_pool(x) x = torch.flatten(x, 1) -# x = self.dropout(x) + # x = self.dropout(x) x = self.last_linear(x) return x - + def forward(self, x): x = self.features(x) x = self.logits(x) return x - + def load_state_dict(self, state_dict, **kwargs): self_keys = list(self.state_dict().keys()) sd_keys = list(state_dict.keys()) - sd_keys = [k for k in sd_keys if "num_batches_tracked" not in k] # filter + sd_keys = [k for k in sd_keys if "num_batches_tracked" not in k] # filter new_state_dict = {} for new_key, old_key in zip(self_keys, sd_keys): new_state_dict[new_key] = state_dict[old_key] super().load_state_dict(new_state_dict, **kwargs) - - + + # fmt: off CFGS = { "hrnet_w18_small": { @@ -368,9 +363,10 @@ def load_state_dict(self, state_dict, **kwargs): "imagenet": {"url": None}, }, } - + # fmt:on - + + def _hrnet(arch, pretrained=None, **kwargs): cfgs = deepcopy(CFGS) cfg_settings = cfgs[arch]["default"] @@ -420,7 +416,7 @@ def hrnet_w18_small(**kwargs): def hrnet_w18(**kwargs): r"""Constructs a 
HRNetv2-18 model.""" return _hrnet("hrnet_w18", **kwargs) - + @wraps(HighResolutionNet) @add_docs_for(HighResolutionNet) @@ -428,33 +424,37 @@ def hrnet_w30(**kwargs): r"""Constructs a HRNetv2-30 model.""" return _hrnet("hrnet_w30", **kwargs) + @wraps(HighResolutionNet) @add_docs_for(HighResolutionNet) def hrnet_w32(**kwargs): r"""Constructs a HRNetv2-32 model.""" return _hrnet("hrnet_w32", **kwargs) + @wraps(HighResolutionNet) @add_docs_for(HighResolutionNet) def hrnet_w40(**kwargs): r"""Constructs a HRNetv2-40 model.""" return _hrnet("hrnet_w40", **kwargs) + @wraps(HighResolutionNet) @add_docs_for(HighResolutionNet) def hrnet_w44(**kwargs): r"""Constructs a HRNetv2-44 model.""" return _hrnet("hrnet_w44", **kwargs) + @wraps(HighResolutionNet) @add_docs_for(HighResolutionNet) def hrnet_w48(**kwargs): r"""Constructs a HRNetv2-48 model.""" return _hrnet("hrnet_w48", **kwargs) + @wraps(HighResolutionNet) @add_docs_for(HighResolutionNet) def hrnet_w64(**kwargs): r"""Constructs a HRNetv2-64 model.""" return _hrnet("hrnet_w64", **kwargs) - diff --git a/pytorch_tools/models/resnet.py b/pytorch_tools/models/resnet.py index 0490cb6..bdc9e57 100644 --- a/pytorch_tools/models/resnet.py +++ b/pytorch_tools/models/resnet.py @@ -214,7 +214,7 @@ def _make_stem(self, stem_type, stem_width, in_channels, norm_layer, norm_act): # in the paper they use conv1x1 but in code conv3x3 (which seems better) self.conv1 = nn.Sequential(SpaceToDepth(), conv3x3(in_channels * 16, stem_width)) self.bn1 = norm_layer(stem_width, activation=norm_act) - self.maxpool = nn.Identity() # not used but needed for code compatability + self.maxpool = nn.Identity() # not used but needed for code compatability else: if stem_type == "deep": self.conv1 = nn.Sequential( @@ -225,7 +225,9 @@ def _make_stem(self, stem_type, stem_width, in_channels, norm_layer, norm_act): conv3x3(stem_width // 2, stem_width), ) else: - self.conv1 = nn.Conv2d(in_channels, stem_width, kernel_size=7, stride=2, padding=3, bias=False) + self.conv1 = nn.Conv2d( + in_channels, stem_width, kernel_size=7, stride=2, padding=3, bias=False + ) self.bn1 = norm_layer(stem_width, activation=norm_act) self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) @@ -296,6 +298,7 @@ def keep_prob(self): self.block_idx += 1 return keep_prob + # fmt: off CFGS = { # RESNET MODELS diff --git a/pytorch_tools/models/tresnet.py b/pytorch_tools/models/tresnet.py index 022c9b0..9be0c36 100644 --- a/pytorch_tools/models/tresnet.py +++ b/pytorch_tools/models/tresnet.py @@ -19,6 +19,7 @@ # avoid overwriting doc string wraps = partial(wraps, assigned=("__module__", "__name__", "__qualname__", "__annotations__")) + class TResNet(ResNet): """TResNet M / TResNet L / XL @@ -71,13 +72,13 @@ def __init__( drop_rate=0.0, drop_connect_rate=0.0, ): - nn.Module.__init__(self) + nn.Module.__init__(self) stem_width = int(64 * width_factor) norm_layer = bn_from_name(norm_layer) self.inplanes = stem_width self.num_classes = num_classes - self.groups = 1 # not really used but needed inside _make_layer - self.base_width = 64 # used inside _make_layer + self.groups = 1 # not really used but needed inside _make_layer + self.base_width = 64 # used inside _make_layer self.norm_act = norm_act self.block_idx = 0 self.num_blocks = sum(layers) @@ -89,9 +90,9 @@ def __init__( raise ValueError("Output stride should be in [8, 16, 32]") # TODO add OS later # if output_stride == 8: - # stride_3, stride_4, dilation_3, dilation_4 = 1, 1, 2, 4 + # stride_3, stride_4, dilation_3, dilation_4 = 1, 1, 2, 4 # elif 
output_stride == 16: - # stride_3, stride_4, dilation_3, dilation_4 = 2, 1, 1, 2 + # stride_3, stride_4, dilation_3, dilation_4 = 2, 1, 1, 2 # elif output_stride == 32: stride_3, stride_4, dilation_3, dilation_4 = 2, 2, 1, 1 @@ -101,11 +102,15 @@ def __init__( self.layer1 = self._make_layer(stem_width, layers[0], stride=1, **largs) self.layer2 = self._make_layer(stem_width * 2, layers[1], stride=2, **largs) - self.block = TBottleneck # first 2 - Basic, last 2 - Bottleneck + self.block = TBottleneck # first 2 - Basic, last 2 - Bottleneck self.expansion = TBottleneck.expansion - self.layer3 = self._make_layer(stem_width * 4, layers[2], stride=stride_3, dilation=dilation_3, **largs) - largs.update(attn_type=None) # no se in last layer - self.layer4 = self._make_layer(stem_width * 8, layers[3], stride=stride_4, dilation=dilation_4, **largs) + self.layer3 = self._make_layer( + stem_width * 4, layers[2], stride=stride_3, dilation=dilation_3, **largs + ) + largs.update(attn_type=None) # no se in last layer + self.layer4 = self._make_layer( + stem_width * 8, layers[3], stride=stride_4, dilation=dilation_4, **largs + ) self.global_pool = FastGlobalAvgPool2d(flatten=True) self.num_features = stem_width * 8 * self.expansion self.encoder = encoder @@ -123,6 +128,7 @@ def load_state_dict(self, state_dict, **kwargs): state_dict.pop("last_linear.bias") nn.Module.load_state_dict(self, state_dict, **kwargs) + # fmt: off # images should be normalized to [0, 1] PRETRAIN_SETTINGS = { @@ -169,6 +175,7 @@ def load_state_dict(self, state_dict, **kwargs): } # fmt: on + def patch_bn(module): """changes weight from InplaceABN to be compatible with usual ABN""" if isinstance(module, ABN): @@ -176,6 +183,7 @@ def patch_bn(module): for m in module.children(): patch_bn(m) + def _resnet(arch, pretrained=None, **kwargs): cfgs = deepcopy(CFGS) cfg_settings = cfgs[arch]["default"] @@ -204,27 +212,32 @@ def _resnet(arch, pretrained=None, **kwargs): # if there is last_linear in state_dict, it's going to be overwritten state_dict["last_linear.weight"] = model.state_dict()["last_linear.weight"] state_dict["last_linear.bias"] = model.state_dict()["last_linear.bias"] - if kwargs.get("in_channels", 3) != 3: # support pretrained for custom input channels - state_dict["conv1.1.weight"] = repeat_channels(state_dict["conv1.1.weight"], kwargs["in_channels"] * 16, 3 * 16) + if kwargs.get("in_channels", 3) != 3: # support pretrained for custom input channels + state_dict["conv1.1.weight"] = repeat_channels( + state_dict["conv1.1.weight"], kwargs["in_channels"] * 16, 3 * 16 + ) model.load_state_dict(state_dict) patch_bn(model) setattr(model, "pretrained_settings", cfg_settings) return model + @wraps(TResNet) @add_docs_for(TResNet) def tresnetm(**kwargs): r"""Constructs a TResnetM model.""" return _resnet("tresnetm", **kwargs) + @wraps(TResNet) @add_docs_for(TResNet) def tresnetl(**kwargs): r"""Constructs a TResnetL model.""" return _resnet("tresnetl", **kwargs) + @wraps(TResNet) @add_docs_for(TResNet) def tresnetxl(**kwargs): r"""Constructs a TResnetXL model.""" - return _resnet("tresnetxl", **kwargs) \ No newline at end of file + return _resnet("tresnetxl", **kwargs) diff --git a/pytorch_tools/models/vgg.py b/pytorch_tools/models/vgg.py index b2037ce..0f17b85 100644 --- a/pytorch_tools/models/vgg.py +++ b/pytorch_tools/models/vgg.py @@ -90,7 +90,6 @@ def forward(self, x): x = self.logits(x) return x - def _make_layers(self, cfg): layers = [] in_channels = self.in_channels diff --git a/pytorch_tools/modules/activated_batch_norm.py 
b/pytorch_tools/modules/activated_batch_norm.py index f4e0786..586ae0d 100644 --- a/pytorch_tools/modules/activated_batch_norm.py +++ b/pytorch_tools/modules/activated_batch_norm.py @@ -46,7 +46,7 @@ def __init__( self.activation = ACT(activation) self.activation_param = activation_param self.frozen = frozen - + if frozen: self.register_buffer("weight", torch.ones(num_features)) self.register_buffer("bias", torch.zeros(num_features)) diff --git a/pytorch_tools/modules/tf_same_ops.py b/pytorch_tools/modules/tf_same_ops.py index 1c5adb6..cfb8026 100644 --- a/pytorch_tools/modules/tf_same_ops.py +++ b/pytorch_tools/modules/tf_same_ops.py @@ -5,6 +5,7 @@ import torch.nn.functional as F from torch.nn.modules.utils import _pair + def pad_same(x, k, s, d, value=0): # type: (Tensor, int, int, int, float)->Tensor # x - input tensor, s - stride, k - kernel_size, d - dilation @@ -15,26 +16,31 @@ def pad_same(x, k, s, d, value=0): x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2], value=value) return x -# current implementation is only for symmetric case. But there are no non symmetric cases + +# current implementation is only for symmetric case. But there are no non symmetric cases def conv2d_same(x, weight, bias=None, stride=(1, 1), dilation=(1, 1), groups=1): # type: (Tensor, Tensor, Optional[torch.Tensor], Tuple[int, int], Tuple[int, int], int)->Tensor x = pad_same(x, weight.shape[-1], stride[0], dilation[0]) return F.conv2d(x, weight, bias, stride, (0, 0), dilation, groups) + def maxpool2d_same(x, kernel_size, stride): # type: (Tensor, Tuple[int, int], Tuple[int, int])->Tensor - x = pad_same(x, kernel_size[0], stride[0], 1, value=-float('inf')) + x = pad_same(x, kernel_size[0], stride[0], 1, value=-float("inf")) return F.max_pool2d(x, kernel_size, stride, (0, 0)) + class Conv2dSamePadding(nn.Conv2d): """Assymetric padding matching TensorFlow `same`""" def forward(self, x): return conv2d_same(x, self.weight, self.bias, self.stride, self.dilation, self.groups) -# as of 1.5 there is no _pair in MaxPool. Remove when this is fixed + +# as of 1.5 there is no _pair in MaxPool. Remove when this is fixed class MaxPool2dSamePadding(nn.MaxPool2d): """Assymetric padding matching TensorFlow `same`""" + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.kernel_size = _pair(self.kernel_size) diff --git a/pytorch_tools/utils/box.py b/pytorch_tools/utils/box.py index 73bb377..16b349e 100644 --- a/pytorch_tools/utils/box.py +++ b/pytorch_tools/utils/box.py @@ -3,6 +3,7 @@ import numpy as np from functools import wraps + def box2delta(boxes, anchors): # type: (Tensor, Tensor)->Tensor """Convert boxes to deltas from anchors. Boxes are expected in 'ltrb' format @@ -14,9 +15,9 @@ def box2delta(boxes, anchors): offset_x, offset_y, scale_x, scale_y """ - anchors_wh = anchors[..., 2:] - anchors[..., :2] # + 1 + anchors_wh = anchors[..., 2:] - anchors[..., :2] anchors_ctr = anchors[..., :2] + 0.5 * anchors_wh - boxes_wh = boxes[..., 2:] - boxes[..., :2] # + 1 + boxes_wh = boxes[..., 2:] - boxes[..., :2] boxes_ctr = boxes[..., :2] + 0.5 * boxes_wh offset_delta = (boxes_ctr - anchors_ctr) / anchors_wh scale_delta = torch.log(boxes_wh / anchors_wh) @@ -40,7 +41,7 @@ def delta2box(deltas, anchors): # Value for clamping large dw and dh predictions. The heuristic is that we clamp # such that dw and dh are no larger than what would transform a 16px box into a # 1000px box (based on a small anchor, 16px, and a typical image size, 1000px). - SCALE_CLAMP = 4.135 # ~= np.log(1000. 
/ 16.) + SCALE_CLAMP = 4.135 # ~= np.log(1000. / 16.) deltas[..., 2:] = deltas[..., 2:].clamp(min=-SCALE_CLAMP, max=SCALE_CLAMP) pred_wh = deltas[..., 2:].exp() * anchors_wh @@ -53,6 +54,7 @@ def box_area(box): """ return (box[..., 2] - box[..., 0]) * (box[..., 3] - box[..., 1]) + def clip_bboxes(bboxes, size): """Args: bboxes (torch.Tensor): in `ltrb` format. Shape [N, 4] @@ -61,14 +63,15 @@ def clip_bboxes(bboxes, size): bboxes[:, 1::2] = bboxes[:, 1::2].clamp(0, size[0]) return bboxes + def clip_bboxes_batch(bboxes, size): # type: (Tensor, Tensor)->Tensor """Args: bboxes (torch.Tensor): in `ltrb` format. Shape [BS, N, 4] size (torch.Tensor): (H, W). Shape [BS, 2] """ size = size.to(bboxes) - h_size = size[..., 0].view(-1, 1, 1) #.float() - w_size = size[..., 1].view(-1, 1, 1) #.float() + h_size = size[..., 0].view(-1, 1, 1) # .float() + w_size = size[..., 1].view(-1, 1, 1) # .float() h_bboxes = bboxes[..., 1::2] w_bboxes = bboxes[..., 0::2] zeros = torch.zeros_like(h_bboxes) @@ -79,6 +82,7 @@ def clip_bboxes_batch(bboxes, size): # bboxes[:, 1::2] = bboxes[:, 1::2].clamp(0, size[0].item()) return bboxes + # implementation from https://github.com/kuangliu/torchcv/blob/master/torchcv/utils/box.py # with slight modifications def box_iou(boxes1, boxes2): @@ -107,13 +111,9 @@ def box_iou(boxes1, boxes2): # based on https://github.com/NVIDIA/retinanet-examples/ -# and on https://github.com/google/automl/ +# and on https://github.com/google/automl/ def generate_anchors_boxes( - image_size, - num_scales=3, - aspect_ratios=(1.0, 2.0, 0.5), - pyramid_levels=[3, 4, 5, 6, 7], - anchor_scale=4, + image_size, num_scales=3, aspect_ratios=(1.0, 2.0, 0.5), pyramid_levels=[3, 4, 5, 6, 7], anchor_scale=4, ): """Generates multiscale anchor boxes Minimum object size which could be detected is anchor_scale * 2**pyramid_levels[0]. By default it's 32px @@ -132,28 +132,28 @@ def generate_anchors_boxes( boxes are in 'ltrb' format num_anchors (int): number of anchors per location """ - + if isinstance(image_size, int): image_size = (image_size, image_size) scale_vals = [anchor_scale * 2 ** (i / num_scales) for i in range(num_scales)] # from lowest stride to largest. Anchors from models should be in the same order! - strides = [2**i for i in pyramid_levels] - + strides = [2 ** i for i in pyramid_levels] + # get offsets for anchor boxes for one pixel # can rewrite in pure Torch but using np is more convenient. 
This function usually should only be called once num_anchors = len(scale_vals) * len(aspect_ratios) ratio_vals_sq = np.sqrt(np.tile(aspect_ratios, len(scale_vals))) scale_vals = np.repeat(scale_vals, len(aspect_ratios))[:, np.newaxis] - wh = np.stack([np.ones(num_anchors) * ratio_vals_sq, np.ones(num_anchors) / ratio_vals_sq], axis=1) - lt = - 0.5 * wh * scale_vals + wh = np.stack([np.ones(num_anchors) * ratio_vals_sq, np.ones(num_anchors) / ratio_vals_sq], axis=1) + lt = -0.5 * wh * scale_vals rb = 0.5 * wh * scale_vals - base_offsets = torch.from_numpy(np.hstack([lt, rb])).float() # [num_anchors, 4] - base_offsets = base_offsets.view(-1, 1, 1, 4) # [num_anchors, 1, 1, 4] + base_offsets = torch.from_numpy(np.hstack([lt, rb])).float() # [num_anchors, 4] + base_offsets = base_offsets.view(-1, 1, 1, 4) # [num_anchors, 1, 1, 4] # generate anchor boxes for all given strides all_anchors = [] for stride in strides: y, x = torch.meshgrid([torch.arange(stride / 2, image_size[i], stride) for i in range(2)]) - xyxy = torch.stack((x, y, x, y), 2).unsqueeze(0) + xyxy = torch.stack((x, y, x, y), 2).unsqueeze(0) # permute to match TF EffDet anchors order after reshape anchors = (xyxy + base_offsets * stride).permute(1, 2, 0, 3).reshape(-1, 4) all_anchors.append(anchors) @@ -162,6 +162,7 @@ def generate_anchors_boxes( # clip_bboxes(all_anchors, image_size) return all_anchors, num_anchors + def generate_targets(anchors, batch_gt_boxes, num_classes, matched_iou=0.5, unmatched_iou=0.4): """Generate targets for regression and classification @@ -169,34 +170,35 @@ def generate_targets(anchors, batch_gt_boxes, num_classes, matched_iou=0.5, unma 1) IoU >= matched_iou: Highest similarity. Matched/Positive. Mask value is 1 2) matched_iou > IoU >= unmatched_iou: Medium similarity. Ignored. Mask value is -1 3) unmatched_iou > IoU: Lowest similarity. Unmatched/Negative. Mask value is 0 - + Args: anchors (torch.Tensor): all anchors on a single image. shape [N, 4] - batch_gt_boxes (torch.Tesor): all groud truth bounding boxes and classes for the batch. shape [BS, N, 5] - classes are expected to be in the last column. + batch_gt_boxes (torch.Tensor): all ground truth bounding boxes and classes for the batch. shape [BS, N, 5] + classes are expected to be in the last column. bboxes are in `ltrb` format! num_classes (int): number of classes. needed for one-hot encoding labels - matched_iou (float): + matched_iou (float): unmatched_iou (float): - + Returns: box_target, cls_target, matches_mask - + """ + def _generate_single_targets(gt_boxes): gt_boxes, gt_classes = gt_boxes.split(4, dim=1) overlap = box_iou(anchors, gt_boxes) - + # Keep best box per anchor overlap, indices = overlap.max(1) box_target = box2delta(gt_boxes[indices], anchors) - - # There are three types of anchors. + + # There are three types of anchors. 
# matched (with objects), unmatched (with background), and in between (which should be ignored) IGNORED_VALUE = -1 UNMATCHED_VALUE = 0 matches_mask = torch.ones_like(overlap) * IGNORED_VALUE - matches_mask[overlap < unmatched_iou] = UNMATCHED_VALUE # background + matches_mask[overlap < unmatched_iou] = UNMATCHED_VALUE # background matches_mask[overlap >= matched_iou] = 1 # Generate one-hot-encoded target classes @@ -206,11 +208,11 @@ def _generate_single_targets(gt_boxes): gt_classes = gt_classes[indices].long() gt_classes[overlap < unmatched_iou] = num_classes # background has no class cls_target.scatter_(1, gt_classes, 1) - cls_target = cls_target[:, :num_classes] # remove background class from one-hot + cls_target = cls_target[:, :num_classes] # remove background class from one-hot return cls_target, box_target, matches_mask - - anchors = anchors.to(batch_gt_boxes) # change device & type if needed + + anchors = anchors.to(batch_gt_boxes) # change device & type if needed batch_results = ([], [], []) for single_gt_boxes in batch_gt_boxes: single_target_results = _generate_single_targets(single_gt_boxes) @@ -218,7 +220,8 @@ def _generate_single_targets(gt_boxes): batch_res.append(single_res) b_cls_target, b_box_target, b_matches_mask = [torch.stack(targets) for targets in batch_results] return b_cls_target, b_box_target, b_matches_mask - + + # copied from torchvision def batched_nms(boxes, scores, idxs, iou_threshold): # type: (Tensor, Tensor, Tensor, float)->Tensor @@ -257,9 +260,10 @@ def batched_nms(boxes, scores, idxs, iou_threshold): keep = torch.ops.torchvision.nms(boxes_for_nms, scores, iou_threshold) return keep + # jit actually makes it slower for fp16 and results are different! # FIXME: check it after 1.6 release. maybe they will fix JIT by that time -# @torch.jit.script +# @torch.jit.script def decode( batch_cls_head, batch_box_head, @@ -274,8 +278,8 @@ def decode( # type: (Tensor, Tensor, Tensor, Tensor, Tensor, float, int, int, float)->Tensor """ Decodes raw outputs of a model for easy visualization of bboxes - - Args: + + Args: batch_cls_head (torch.Tensor): shape [BS, *, NUM_CLASSES] batch_box_head (torch.Tensor): shape [BS, *, 4] anchors (torch.Tensor): shape [*, 4] @@ -285,25 +289,27 @@ def decode( max_detection_points (int): Maximum number of bboxes to consider for NMS for one image max_detection_per_image (int): Maximum number of bboxes to return per image iou_threshold (float): iou_threshold for Non Maximum Supression - + Returns: torch.Tensor with bboxes, scores and classes shape [BS, MAX_DETECTION_PER_IMAGE, 6]. bboxes in 'ltrb' format. If img_shape is not given they are NOT CLIPPED (!) 
""" - + batch_size = batch_cls_head.size(0) num_classes = batch_cls_head.size(-1) - anchors = anchors.to(batch_cls_head).unsqueeze(0).expand(batch_size, -1, -1) # [N, 4] -> [BS, N, 4] + anchors = anchors.to(batch_cls_head).unsqueeze(0).expand(batch_size, -1, -1) # [N, 4] -> [BS, N, 4] # it has to be raw logits but check anyway to avoid applying sigmoid twice if batch_cls_head.min() < 0 or batch_cls_head.max() > 1: batch_cls_head = batch_cls_head.sigmoid() - - # It's much faster to calculate topk once for full batch here rather than doing it inside loop + + # It's much faster to calculate topk once for full batch here rather than doing it inside loop # In TF The same bbox may belong to two different objects # select `max_detection_points` scores and corresponding bboxes - scores_topk_all, cls_topk_indices_all = torch.topk(batch_cls_head.view(batch_size, -1), k=max_detection_points) + scores_topk_all, cls_topk_indices_all = torch.topk( + batch_cls_head.view(batch_size, -1), k=max_detection_points + ) indices_all = cls_topk_indices_all / num_classes classes_all = cls_topk_indices_all % num_classes @@ -322,32 +328,29 @@ def decode( out_classes = torch.zeros((batch_size, max_detection_per_image)).to(batch_cls_head) for batch in range(batch_size): - scores_topk = scores_topk_all[batch] # , cls_topk_indices_all[batch] - classes = classes_all[batch] #cls_topk_indices % num_classes - box_topk = box_topk_all[batch] # torch.gather(batch_box_head[batch], 0, indices) - anchor_topk = anchors_topk_all[batch] - regressed_boxes = regressed_boxes_all[batch] # delta2box(box_topk, anchor_topk) + scores_topk = scores_topk_all[batch] # , cls_topk_indices_all[batch] + classes = classes_all[batch] # cls_topk_indices % num_classes + regressed_boxes = regressed_boxes_all[batch] # delta2box(box_topk, anchor_topk) # apply NMS nms_idx = batched_nms(regressed_boxes, scores_topk, classes, iou_threshold) - nms_idx = nms_idx[:min(len(nms_idx), max_detection_per_image)] + nms_idx = nms_idx[: min(len(nms_idx), max_detection_per_image)] # select suppressed bboxes im_scores = scores_topk[nms_idx] im_classes = classes[nms_idx] im_bboxes = regressed_boxes[nms_idx] im_classes += 1 # back to class idx with background class = 0 - out_scores[batch, :im_scores.size(0)] = im_scores - out_classes[batch, :im_classes.size(0)] = im_classes - out_boxes[batch, :im_bboxes.size(0)] = im_bboxes + out_scores[batch, : im_scores.size(0)] = im_scores + out_classes[batch, : im_classes.size(0)] = im_classes + out_boxes[batch, : im_bboxes.size(0)] = im_bboxes # no need to pad because it's already padded with 0's - ## old way ## # get regressed bboxes # all_img_bboxes = delta2box(batch_box_head[batch], anchors) # if img_shape: # maybe clip - # all_img_bboxes = clip_bboxes(all_img_bboxes, img_shape) + # all_img_bboxes = clip_bboxes(all_img_bboxes, img_shape) # select at most `top_n` bboxes and from them select with score > threshold # max_cls_score, max_cls_idx = batch_cls_head[batch].max(1) # top_cls_score, top_cls_idx = max_cls_score.topk(top_n) @@ -356,15 +359,15 @@ def decode( # im_scores = max_cls_score[top_cls_idx] # im_classes = max_cls_idx[top_cls_idx] # im_bboxes = all_img_bboxes[top_cls_idx] - + # apply NMS # nms_idx = batched_nms(im_bboxes, im_scores, im_classes, iou_threshold) # im_scores = im_scores[nms_idx] # im_classes = im_classes[nms_idx] # im_bboxes = im_bboxes[nms_idx] - + # out_scores[batch, :im_scores.size(0)] = im_scores # out_classes[batch, :im_classes.size(0)] = im_classes # out_boxes[batch, :im_bboxes.size(0)] = im_bboxes - - 
return torch.cat([out_boxes, out_scores.unsqueeze(-1), out_classes.unsqueeze(-1)], dim=2) \ No newline at end of file + + return torch.cat([out_boxes, out_scores.unsqueeze(-1), out_classes.unsqueeze(-1)], dim=2) diff --git a/pytorch_tools/utils/misc.py b/pytorch_tools/utils/misc.py index cf1fc85..733a22e 100644 --- a/pytorch_tools/utils/misc.py +++ b/pytorch_tools/utils/misc.py @@ -5,10 +5,10 @@ import random import collections import numpy as np +from functools import partial import torch.nn as nn import torch.nn.functional as F import torch.distributed as dist -from functools import partial def initialize_fn(m): @@ -27,10 +27,12 @@ def initialize_fn(m): nn.init.kaiming_uniform_(m.weight, mode="fan_out", nonlinearity="linear") nn.init.constant_(m.bias, 0) + def initialize(module): for m in module.modules(): initialize_fn(m) + def initialize_iterator(module_iterator): for m in module_iterator: initialize_fn(m) @@ -219,6 +221,7 @@ def make_divisible(v, divisor=8): new_v += divisor return new_v + def repeat_channels(conv_weights, new_channels, old_channels=3): """Repeat channels to match new number of input channels Args: @@ -228,5 +231,5 @@ def repeat_channels(conv_weights, new_channels, old_channels=3): """ rep_times = math.ceil(new_channels / old_channels) new_weights = conv_weights.repeat(1, rep_times, 1, 1)[:, :new_channels, :, :] - new_weights *= old_channels / new_channels # to keep the same output amplitude - return new_weights \ No newline at end of file + new_weights *= old_channels / new_channels # to keep the same output amplitude + return new_weights diff --git a/tests/detection_models/test_det_models.py b/tests/detection_models/test_det_models.py index 62ebb13..0ab1ff2 100644 --- a/tests/detection_models/test_det_models.py +++ b/tests/detection_models/test_det_models.py @@ -51,7 +51,7 @@ def test_coco_pretrain(arch): im = np.array(im.resize((inp_size, inp_size))) im_t = tensor_from_rgb_image(preprocess_fn(im)).unsqueeze(0).float().cuda() boxes_scores_classes = m.predict(im_t) - # check that most confident bbox is close to correct class. The reason for such strange test is + # check that most confident bbox is close to correct class. 
The reason for such strange test is # because in different models class mappings are shifted by +- 1 assert (boxes_scores_classes[0, 0, 5] - im_cls) < 2 @@ -61,6 +61,7 @@ def test_pretrain_custom_num_classes(arch): m = pt_det.__dict__[arch](pretrained="coco", num_classes=80).eval().cuda() _test_forward(m) + @pytest.mark.parametrize("arch", MODEL_NAMES[:2]) def test_encoder_frozenabn(arch): m = pt_det.__dict__[arch](encoder_norm_layer="frozenabn").eval().cuda() diff --git a/tests/losses/test_losses.py b/tests/losses/test_losses.py index cf3090e..2f7a741 100644 --- a/tests/losses/test_losses.py +++ b/tests/losses/test_losses.py @@ -52,9 +52,7 @@ def test_focal_loss_fn_basic(): @pytest.mark.parametrize("reduction", ["sum", "mean", "none"]) def test_focal_loss_fn_reduction(reduction): - torch_ce = F.binary_cross_entropy_with_logits( - INP_BINARY, TARGET_BINARY.float(), reduction=reduction - ) + torch_ce = F.binary_cross_entropy_with_logits(INP_BINARY, TARGET_BINARY.float(), reduction=reduction) my_ce = pt_F.focal_loss_with_logits(INP_BINARY, TARGET_BINARY, alpha=0.5, gamma=0, reduction=reduction) assert torch.allclose(torch_ce, my_ce * 2) @@ -108,6 +106,7 @@ def test_focal_loss(): fl_i = losses.FocalLoss(mode="binary", reduction="sum", ignore_label=-100)(INP_IMG_BINARY, y_true) assert torch.allclose(fl.sum() - loss_diff, fl_i) + @pytest.mark.parametrize( ["y_true", "y_pred", "expected"], [ @@ -333,9 +332,7 @@ def test_binary_cross_entropy(reduction): assert torch.allclose(torch_ce, my_ce) # test for images - torch_ce = F.binary_cross_entropy_with_logits( - INP_IMG_BINARY, TARGET_IMG_BINARY, reduction=reduction - ) + torch_ce = F.binary_cross_entropy_with_logits(INP_IMG_BINARY, TARGET_IMG_BINARY, reduction=reduction) my_ce = my_ce_loss(INP_IMG_BINARY, TARGET_IMG_BINARY) assert torch.allclose(torch_ce, my_ce) @@ -391,4 +388,4 @@ def test_binary_hinge(): @pytest.mark.parametrize("reduction", ["sum", "mean", "none"]) def test_smoothl1(reduction): loss_my = losses.SmoothL1Loss(delta=1, reduction=reduction)(INP, TARGET_MULTILABEL) - loss_torch = F.smooth_l1_loss(INP, TARGET_MULTILABEL, reduction=reduction) \ No newline at end of file + loss_torch = F.smooth_l1_loss(INP, TARGET_MULTILABEL, reduction=reduction) diff --git a/tests/models/test_models.py b/tests/models/test_models.py index d5fb396..0042a85 100644 --- a/tests/models/test_models.py +++ b/tests/models/test_models.py @@ -23,7 +23,14 @@ HRNET_NAMES = [name for name in ALL_MODEL_NAMES if "hrnet" in name] # test only part of the models -TEST_MODEL_NAMES = DENSENET_NAMES[:1] + EFFNET_NAMES[:1] + VGG_NAMES[:1] + RESNET_NAMES[:1] + TRESNET_NAMES[:1] + HRNET_NAMES[:1] +TEST_MODEL_NAMES = ( + DENSENET_NAMES[:1] + + EFFNET_NAMES[:1] + + VGG_NAMES[:1] + + RESNET_NAMES[:1] + + TRESNET_NAMES[:1] + + HRNET_NAMES[:1] +) # TEST_MODEL_NAMES = HRNET_NAMES[:1] INP = torch.ones(2, 3, 128, 128) @@ -52,6 +59,7 @@ def test_custom_in_channels(arch): with torch.no_grad(): m(torch.ones(2, 5, 128, 128)) + @pytest.mark.parametrize("arch", EFFNET_NAMES[:2] + RESNET_NAMES[:2]) def test_pretrained_custom_in_channels(arch): m = models.__dict__[arch](in_channels=5, pretrained="imagenet") @@ -82,11 +90,13 @@ def test_dilation(arch, output_stride): W, H = INP.shape[-2:] assert res.shape[-2:] == (W // output_stride, H // output_stride) + @pytest.mark.parametrize("arch", EFFNET_NAMES[:2] + RESNET_NAMES[:2]) def test_drop_connect(arch): m = models.__dict__[arch](drop_connect_rate=0.2) _test_forward(m) + NUM_PARAMS = { "tresnetm": 31389032, "tresnetl": 55989256, @@ -96,13 
+106,16 @@ def test_drop_connect(arch): "efficientnet_b2": 9109994, "efficientnet_b3": 12233232, } -@pytest.mark.parametrize('name_num_params', zip(NUM_PARAMS.items())) + + +@pytest.mark.parametrize("name_num_params", zip(NUM_PARAMS.items())) def test_num_parameters(name_num_params): name, num_params = name_num_params[0] m = models.__dict__[name]() assert pt.utils.misc.count_parameters(m)[0] == num_params -@pytest.mark.parametrize('stem_type', ["", "deep", "space2depth"]) + +@pytest.mark.parametrize("stem_type", ["", "deep", "space2depth"]) def test_resnet_stem_type(stem_type): m = models.resnet50(stem_type=stem_type) - _test_forward(m) \ No newline at end of file + _test_forward(m) diff --git a/tests/models/test_weights.py b/tests/models/test_weights.py index c22c173..c68cd72 100644 --- a/tests/models/test_weights.py +++ b/tests/models/test_weights.py @@ -55,6 +55,7 @@ def test_imagenet_pretrain(arch): pred_cls = m(im).argmax() assert pred_cls == im_cls + # test that output mean for fixed input is the same MODEL_NAMES2 = [ "resnet34", @@ -68,6 +69,7 @@ def test_imagenet_pretrain(arch): "efficientnet_b0": 0.0070, } + @pytest.mark.parametrize("arch", MODEL_NAMES2) def test_output_mean(arch): m = models.__dict__[arch](pretrained="imagenet") diff --git a/tests/modules/test_modules.py b/tests/modules/test_modules.py index 969adc0..1be5837 100644 --- a/tests/modules/test_modules.py +++ b/tests/modules/test_modules.py @@ -13,12 +13,14 @@ def test_activations_init(activation): res = act(inp) assert res.mean() + def test_frozen_abn(): l = modules.bn_from_name("frozen_abn")(10) assert list(l.parameters()) == [] l = modules.ABN(10, frozen=True) assert list(l.parameters()) == [] + # need to test and resnet and vgg because in resnet there are no Convs with bias # and in VGG there are no Convs without bias @pytest.mark.parametrize("norm_layer", ["abn", "agn"]) diff --git a/tests/segmentation_models/test_segm_models.py b/tests/segmentation_models/test_segm_models.py index d454815..cb6378d 100644 --- a/tests/segmentation_models/test_segm_models.py +++ b/tests/segmentation_models/test_segm_models.py @@ -7,12 +7,13 @@ INP = torch.ones(2, 3, 64, 64) ENCODERS = ["resnet34", "se_resnet50", "efficientnet_b1", "densenet121"] -SEGM_ARCHS = [pt_sm.Unet, pt_sm.Linknet, pt_sm.DeepLabV3, pt_sm.SegmentationFPN, pt_sm.SegmentationBiFPN] +SEGM_ARCHS = [pt_sm.Unet, pt_sm.Linknet, pt_sm.DeepLabV3, pt_sm.SegmentationFPN] # pt_sm.SegmentationBiFPN # this lines are usefull for quick tests # ENCODERS = ["se_resnet50"] # SEGM_ARCHS = [pt_sm.SegmentationFPN, pt_sm.SegmentationFPN] + def _test_forward(model): with torch.no_grad(): return model(INP) @@ -47,21 +48,24 @@ def test_num_classes(encoder_name, model_class): out = _test_forward(m) assert out.size(1) == 5 + @pytest.mark.parametrize("encoder_name", ENCODERS) @pytest.mark.parametrize("model_class", SEGM_ARCHS) def test_drop_rate(encoder_name, model_class): m = model_class(encoder_name=encoder_name, drop_rate=0.2) _test_forward(m) + @pytest.mark.parametrize("encoder_name", ENCODERS) @pytest.mark.parametrize("model_class", [pt_sm.DeepLabV3]) # pt_sm.Unet, pt_sm.Linknet @pytest.mark.parametrize("output_stride", [32, 16, 8]) def test_dilation(encoder_name, model_class, output_stride): if output_stride == 8 and model_class != pt_sm.DeepLabV3: - return None # OS=8 only supported for Deeplab + return None # OS=8 only supported for Deeplab m = model_class(encoder_name=encoder_name, output_stride=output_stride) _test_forward(m) + @pytest.mark.parametrize("model_class", 
[pt_sm.DeepLabV3, pt_sm.SegmentationFPN]) def test_deeplab_last_upsample(model_class): m = model_class(last_upsample=True) @@ -74,7 +78,8 @@ def test_deeplab_last_upsample(model_class): # should be 4 times smaller assert tuple(out.shape[-2:]) == (W // 4, H // 4) + @pytest.mark.parametrize("merge_policy", ["add", "cat"]) def test_merge_policy(merge_policy): m = pt_sm.SegmentationFPN(merge_policy=merge_policy) - _test_forward(m) \ No newline at end of file + _test_forward(m) diff --git a/tests/utils/test_utils.py b/tests/utils/test_utils.py index 13b4f23..e3257f4 100644 --- a/tests/utils/test_utils.py +++ b/tests/utils/test_utils.py @@ -2,18 +2,23 @@ import pytest import pytorch_tools as pt + def random_boxes(mean_box, stdev, N): return torch.rand(N, 4) * stdev + torch.tensor(mean_box, dtype=torch.float) + +# fmt: off DEVICE_DTYPE = [ ("cpu", torch.float), ("cuda", torch.float), ("cuda", torch.half) ] +# fmt: on # check that it works for all combinations of dtype and device @pytest.mark.parametrize("device_dtype", DEVICE_DTYPE) def test_clip_bboxes(device_dtype): device, dtype = device_dtype + # fmt: off bboxes = torch.tensor( [ [-5, -10, 50, 100], @@ -30,6 +35,7 @@ def test_clip_bboxes(device_dtype): device=device, dtype=dtype, ) + # fmt: on size = (60, 40) # test single bbox clip res1 = pt.utils.box.clip_bboxes(bboxes, size) @@ -55,9 +61,11 @@ def test_clip_bboxes(device_dtype): res5 = jit_clip(batch_bboxes.clone(), batch_sizes) assert torch.allclose(res5, batch_expected) + @pytest.mark.parametrize("device_dtype", DEVICE_DTYPE) def test_delta2box(device_dtype): device, dtype = device_dtype + # fmt: off anchors = torch.tensor( [ [ 0., 0., 1., 1.], @@ -84,12 +92,13 @@ def test_delta2box(device_dtype): [0.0000, 0.0000, 1.0000, 1.0000], [0.1409, 0.1409, 2.8591, 2.8591], [-3.1945, 0.3161, 4.1945, 0.6839], - [5.0000, 5.0000, 5.0000, 5.0000] + [5.0000, 5.0000, 5.0000, 5.0000], ], device=device, dtype=dtype, ) - res1 = pt.utils.box.delta2box(deltas, anchors) + # fmt: on + res1 = pt.utils.box.delta2box(deltas, anchors) assert torch.allclose(res1, expected_res, atol=3e-4) BS = 4 @@ -97,8 +106,8 @@ def test_delta2box(device_dtype): batch_deltas = deltas.unsqueeze(0).expand(BS, -1, -1) batch_expected = expected_res.unsqueeze(0).expand(BS, -1, -1) - # test applying to batch - res2 = pt.utils.box.delta2box(batch_deltas.clone(), batch_anchors) + # test applying to batch + res2 = pt.utils.box.delta2box(batch_deltas.clone(), batch_anchors) assert torch.allclose(res2, batch_expected, atol=3e-4) # check that function is JIT script friendly @@ -106,6 +115,7 @@ def test_delta2box(device_dtype): res3 = jit_func(batch_deltas.clone(), batch_anchors) assert torch.allclose(res3, batch_expected, atol=3e-4) + @pytest.mark.parametrize("device_dtype", DEVICE_DTYPE) def test_box2delta(device_dtype): ## this test only checks that encoding and decoding gives the same result @@ -114,12 +124,12 @@ def test_box2delta(device_dtype): anchors = random_boxes([10, 10, 20, 20], 10, 10).to(device).to(dtype) deltas = pt.utils.box.box2delta(boxes, anchors) boxes_reconstructed = pt.utils.box.delta2box(deltas, anchors) - atol = 2e-2 if dtype == torch.half else 1e-6 # for fp16 sometimes error is large - assert torch.allclose(boxes, boxes_reconstructed, atol=atol) + atol = 2e-2 if dtype == torch.half else 1e-6 # for fp16 sometimes error is large + assert torch.allclose(boxes, boxes_reconstructed, atol=atol) # check that it's jit friendly jit_box2delta = torch.jit.script(pt.utils.box.box2delta) jit_delta2box = 
torch.jit.script(pt.utils.box.delta2box)
     deltas2 = jit_box2delta(boxes, anchors)
     boxes_reconstructed2 = jit_delta2box(deltas2, anchors)
-    assert torch.allclose(boxes, boxes_reconstructed2, atol=atol)
\ No newline at end of file
+    assert torch.allclose(boxes, boxes_reconstructed2, atol=atol)
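As a quick illustration of the box utilities exercised by the tests above, here is a sketch with invented coordinates; it assumes only the call signatures visible in this diff (`box2delta(boxes, anchors)`, `delta2box(deltas, anchors)`, `clip_bboxes(bboxes, size)`). Encoding followed by decoding should reproduce the input boxes up to floating-point tolerance, which is what `test_box2delta` verifies.

import torch
import pytorch_tools as pt

# four-coordinate corner boxes; values are made up for the example
anchors = torch.tensor([[0.0, 0.0, 10.0, 10.0], [5.0, 5.0, 15.0, 15.0]])
boxes = torch.tensor([[1.0, 2.0, 9.0, 11.0], [4.0, 4.0, 16.0, 14.0]])

deltas = pt.utils.box.box2delta(boxes, anchors)     # encode boxes relative to their anchors
restored = pt.utils.box.delta2box(deltas, anchors)  # decode back to absolute coordinates
assert torch.allclose(boxes, restored, atol=1e-5)   # round trip is (nearly) exact in fp32

# clip_bboxes clamps boxes so they stay inside an image of the given size
clipped = pt.utils.box.clip_bboxes(restored, (12, 12))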
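The `decode` hunk in `pytorch_tools/utils/box.py` above leans on one compact trick: class scores for all anchors are flattened to `[BS, num_anchors * num_classes]`, top-k is computed once for the whole batch, and the anchor index and class id are recovered with division and modulo. A toy restatement of just that step (shapes and names invented; `//` is used for the index arithmetic here, since on recent PyTorch `/` between integer tensors returns floats, whereas the original `/` relied on older integer-division behavior):

import torch

batch_size, num_anchors, num_classes, k = 2, 6, 3, 4
cls_head = torch.rand(batch_size, num_anchors, num_classes)             # raw per-anchor class scores

scores_topk, flat_idx = torch.topk(cls_head.view(batch_size, -1), k=k)  # single topk for the full batch
anchor_idx = flat_idx // num_classes                                    # which anchor each hit came from
class_idx = flat_idx % num_classes                                      # which class each hit belongs to

# sanity check: indexing by the recovered (anchor, class) pairs reproduces the top-k scores
assert torch.allclose(cls_head[0, anchor_idx[0], class_idx[0]], scores_topk[0])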
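Likewise, `repeat_channels` from `pytorch_tools/utils/misc.py` (only reformatted in this patch) tiles a pretrained first-convolution weight across extra input channels and rescales it by `old_channels / new_channels` so the expected activation magnitude stays roughly unchanged. Restated on a toy tensor with invented shapes:

import math
import torch

old_c, new_c = 3, 5
w3 = torch.randn(8, old_c, 3, 3)                            # pretrained conv weight for 3 input channels
rep = math.ceil(new_c / old_c)                              # tile enough copies, then trim to new_c channels
w5 = w3.repeat(1, rep, 1, 1)[:, :new_c] * (old_c / new_c)   # rescale to keep output amplitude comparable
assert w5.shape == (8, new_c, 3, 3)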