
Commit c00db4e

Add new CSPNet preset and add manual padding. (#2212)
* csp stages zero padding
* zero padding for all stages and maxpooling
* add manual padding
* remove activation for conv3
* add zero padding for stages
* indentation fix
* remove zero padding for avg_down
* add more presets and timm conversion
* fix configs and timm preset logic
* change preset loader logic
* preset loader fix
1 parent 38ca305 commit c00db4e

File tree: 7 files changed (+209, -67 lines)


keras_hub/src/models/cspnet/cspnet_backbone.py

Lines changed: 51 additions & 26 deletions
@@ -81,7 +81,7 @@ class CSPNetBackbone(FeaturePyramidBackbone):
 
     # Pretrained backbone
    model = keras_hub.models.CSPNetBackbone.from_preset(
-        "cspdarknet53_ra_imagenet"
+        "csp_darknet_53_ra_imagenet"
    )
    model(input_data)
 
@@ -357,18 +357,6 @@ def apply(x):
             dtype=dtype,
             name=f"{name}_bottleneck_block_bn_3",
         )(x)
-        if activation == "leaky_relu":
-            x = layers.LeakyReLU(
-                negative_slope=0.01,
-                dtype=dtype,
-                name=f"{name}_bottleneck_block_activation_3",
-            )(x)
-        else:
-            x = layers.Activation(
-                activation,
-                dtype=dtype,
-                name=f"{name}_bottleneck_block_activation_3",
-            )(x)
 
         x = layers.add(
             [x, shortcut], dtype=dtype, name=f"{name}_bottleneck_block_add"
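The removed block is the "remove activation for conv3" item from the commit message: the activation that used to sit between the third batch norm and the residual add is dropped, so the block now goes conv -> bn -> add(shortcut), matching timm's reference BottleneckBlock, where the last conv is built without an activation and the nonlinearity is applied only after the shortcut add.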
@@ -673,6 +661,13 @@ def apply(x):
                     name=f"{name}_csp_activation_1",
                 )(x)
         else:
+            if strides > 1:
+                x = layers.ZeroPadding2D(
+                    1,
+                    data_format=data_format,
+                    dtype=dtype,
+                    name=f"{name}_csp_conv_pad_1",
+                )(x)
             x = layers.Conv2D(
                 filters=down_chs,
                 kernel_size=3,
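A minimal sketch of the padding pattern this and the following hunks introduce (layer names, shapes, and the use_bias flag here are illustrative, not the backbone code): an explicit, symmetric ZeroPadding2D in front of a stride-2 "valid" convolution. Keras' padding="same" pads asymmetrically on even inputs (the extra row/column goes to the bottom/right), whereas timm/PyTorch convolutions with padding=1 pad symmetrically, so making the padding explicit keeps ported weights spatially aligned.

import numpy as np
from keras import layers

x = np.random.randn(1, 64, 64, 32).astype("float32")

# Symmetric padding of 1, then a stride-2 conv with no implicit padding.
x = layers.ZeroPadding2D(1, name="example_conv_pad_1")(x)
x = layers.Conv2D(64, kernel_size=3, strides=2, padding="valid", use_bias=False)(x)

print(x.shape)  # (1, 32, 32, 64): same shape as padding="same", different window alignment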
@@ -882,6 +877,13 @@ def apply(x):
                     name=f"{name}_cs3_activation_1",
                 )(x)
         else:
+            if strides > 1:
+                x = layers.ZeroPadding2D(
+                    1,
+                    data_format=data_format,
+                    dtype=dtype,
+                    name=f"{name}_cs3_conv_pad_1",
+                )(x)
             x = layers.Conv2D(
                 filters=down_chs,
                 kernel_size=3,
@@ -1062,6 +1064,13 @@ def apply(x):
                     name=f"{name}_dark_activation_1",
                 )(x)
         else:
+            if strides > 1:
+                x = layers.ZeroPadding2D(
+                    1,
+                    data_format=data_format,
+                    dtype=dtype,
+                    name=f"{name}_dark_conv_pad_1",
+                )(x)
             x = layers.Conv2D(
                 filters=filters,
                 kernel_size=3,
@@ -1091,18 +1100,18 @@ def apply(x):
                     dtype=dtype,
                     name=f"{name}_dark_activation_1",
                 )(x)
-            for i in range(depth):
-                x = block_fn(
-                    filters=block_channels,
-                    dilation=dilation,
-                    bottle_ratio=bottle_ratio,
-                    groups=groups,
-                    activation=activation,
-                    data_format=data_format,
-                    channel_axis=channel_axis,
-                    dtype=dtype,
-                    name=f"{name}_block_{i}",
-                )(x)
+        for i in range(depth):
+            x = block_fn(
+                filters=block_channels,
+                dilation=dilation,
+                bottle_ratio=bottle_ratio,
+                groups=groups,
+                activation=activation,
+                data_format=data_format,
+                channel_axis=channel_axis,
+                dtype=dtype,
+                name=f"{name}_block_{i}",
+            )(x)
         return x
 
     return apply
@@ -1135,6 +1144,13 @@ def apply(x):
                 or (i == last_idx and strides > 2 and not pooling)
                 else 1
             )
+            if conv_strides > 1:
+                x = layers.ZeroPadding2D(
+                    (kernel_size - 1) // 2,
+                    data_format=data_format,
+                    dtype=dtype,
+                    name=f"csp_stem_pad_{i}",
+                )(x)
             x = layers.Conv2D(
                 filters=chs,
                 kernel_size=kernel_size,
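In the stem the pad amount generalizes to (kernel_size - 1) // 2: 1 for the 3x3 DarkNet-style stem convolutions and 3 for the 7x7 ResNet-style stems used by the new presets below.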
@@ -1167,10 +1183,19 @@ def apply(x):
 
         if pooling == "max":
             assert strides > 2
+            # Use manual padding to handle edge case scenario to ignore zero's
+            # as max value instead consider negative values from Leaky Relu type
+            # of activations.
+            pad_width = [[1, 1], [1, 1]]
+            if data_format == "channels_last":
+                pad_width += [[0, 0]]
+            else:
+                pad_width = [[0, 0]] + pad_width
+            pad_width = [[0, 0]] + pad_width
+            x = ops.pad(x, pad_width=pad_width, constant_values=float("-inf"))
             x = layers.MaxPooling2D(
                 pool_size=3,
                 strides=2,
-                padding="same",
                 data_format=data_format,
                 dtype=dtype,
                 name="csp_stem_pool",

keras_hub/src/models/cspnet/cspnet_backbone_test.py

Lines changed: 3 additions & 2 deletions
@@ -22,6 +22,7 @@ def setUp(self):
             "expand_ratio": (2.0,) + (1.0,),
             "block_type": "dark_block",
             "stage_type": "csp",
+            "stem_padding": "same",
         }
         self.input_size = 64
         self.input_data = ops.ones((2, self.input_size, self.input_size, 3))
@@ -38,9 +39,9 @@ def test_backbone_basics(self, stage_type, block_type):
                 "stage_type": stage_type,
             },
             input_data=self.input_data,
-            expected_output_shape=(2, 6, 6, 48),
+            expected_output_shape=(2, 8, 8, 48),
             expected_pyramid_output_keys=["P2", "P3", "P4"],
-            expected_pyramid_image_sizes=[(30, 30), (14, 14), (6, 6)],
+            expected_pyramid_image_sizes=[(32, 32), (16, 16), (8, 8)],
         )
 
     @pytest.mark.large
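The updated expectations follow from the stem/stage geometry above: with explicit symmetric padding every stride-2 step halves the 64x64 test input exactly, giving pyramid sizes 32 -> 16 -> 8 instead of the previous 30 -> 14 -> 6.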

keras_hub/src/models/cspnet/cspnet_presets.py

Lines changed: 38 additions & 3 deletions
@@ -6,11 +6,46 @@
             "description": (
                 "A CSP-DarkNet (Cross-Stage-Partial) image classification model"
                 " pre-trained on the Randomly Augmented ImageNet 1k dataset at "
-                "a 224x224 resolution."
+                "a 256x256 resolution."
             ),
-            "params": 26652512,
+            "params": 27642184,
             "path": "cspnet",
         },
-        "kaggle_handle": "kaggle://keras/cspdarknet/keras/csp_darknet_53_ra_imagenet/1",
+        "kaggle_handle": "kaggle://keras/cspdarknet/keras/csp_darknet_53_ra_imagenet/2",
+    },
+    "csp_resnext_50_ra_imagenet": {
+        "metadata": {
+            "description": (
+                "A CSP-ResNeXt (Cross-Stage-Partial) image classification model"
+                " pre-trained on the Randomly Augmented ImageNet 1k dataset at "
+                "a 256x256 resolution."
+            ),
+            "params": 20569896,
+            "path": "cspnet",
+        },
+        "kaggle_handle": "kaggle://keras/cspdarknet/keras/csp_resnext_50_ra_imagenet/1",
+    },
+    "csp_resnet_50_ra_imagenet": {
+        "metadata": {
+            "description": (
+                "A CSP-ResNet (Cross-Stage-Partial) image classification model"
+                " pre-trained on the Randomly Augmented ImageNet 1k dataset at "
+                "a 256x256 resolution."
+            ),
+            "params": 21616168,
+            "path": "cspnet",
+        },
+        "kaggle_handle": "kaggle://keras/cspdarknet/keras/csp_resnet_50_ra_imagenet/1",
+    },
+    "darknet_53_imagenet": {
+        "metadata": {
+            "description": (
+                "A DarkNet image classification model pre-trained on the"
+                "ImageNet 1k dataset at a 256x256 resolution."
+            ),
+            "params": 41609928,
+            "path": "cspnet",
+        },
+        "kaggle_handle": "kaggle://keras/cspdarknet/keras/darknet_53_imagenet/1",
     },
 }
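A hedged usage sketch for the presets registered above (it assumes the corresponding Kaggle assets are published; the 256x256 input follows the preset descriptions):

import numpy as np
import keras_hub

images = np.random.uniform(size=(1, 256, 256, 3)).astype("float32")

# Any of the registered names works here, e.g. "csp_resnet_50_ra_imagenet",
# "csp_resnext_50_ra_imagenet", or "darknet_53_imagenet".
backbone = keras_hub.models.CSPNetBackbone.from_preset("csp_darknet_53_ra_imagenet")
features = backbone(images)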

keras_hub/src/utils/timm/convert_cspnet.py

Lines changed: 94 additions & 23 deletions
@@ -17,10 +17,69 @@ def convert_backbone_config(timm_config):
         bottle_ratio = (0.5,) + (1.0,)
         block_ratio = (1.0,) + (0.5,)
         expand_ratio = (2.0,) + (1.0,)
+        stem_padding = "same"
+        stem_pooling = None
         stage_type = "csp"
+        groups = 1
         block_type = "dark_block"
         down_growth = True
-        stackwise_strides = 2
+        stackwise_strides = [2, 2, 2, 2, 2]
+        avg_down = False
+        cross_linear = False
+    elif timm_architecture == "cspresnet50":
+        stem_filters = 64
+        stem_kernel_size = 7
+        stem_strides = 4
+        stackwise_depth = [3, 3, 5, 2]
+        stackwise_strides = [1, 2, 2, 2]
+        stackwise_num_filters = [128, 256, 512, 1024]
+        block_type = "bottleneck_block"
+        stage_type = "csp"
+        bottle_ratio = [0.5]
+        block_ratio = [1.0]
+        expand_ratio = [2.0]
+        stem_padding = "valid"
+        stem_pooling = "max"
+        avg_down = False
+        groups = 1
+        down_growth = False
+        cross_linear = True
+    elif timm_architecture == "cspresnext50":
+        stem_filters = 64
+        stem_kernel_size = 7
+        stem_strides = 4
+        stackwise_depth = [3, 3, 5, 2]
+        stackwise_num_filters = [256, 512, 1024, 2048]
+        bottle_ratio = [1.0]
+        block_ratio = [0.5]
+        expand_ratio = [1.0]
+        stage_type = "csp"
+        block_type = "bottleneck_block"
+        stem_pooling = "max"
+        stackwise_strides = [1, 2, 2, 2]
+        groups = 32
+        stem_padding = "valid"
+        avg_down = False
+        down_growth = False
+        cross_linear = True
+    elif timm_architecture == "darknet53":
+        stem_filters = 32
+        stem_kernel_size = 3
+        stem_strides = 1
+        stackwise_depth = [1, 2, 8, 8, 4]
+        stackwise_num_filters = [64, 128, 256, 512, 1024]
+        bottle_ratio = [0.5]
+        block_ratio = [1.0]
+        groups = 1
+        expand_ratio = [1.0]
+        stage_type = "dark"
+        block_type = "dark_block"
+        stem_pooling = None
+        stackwise_strides = [2, 2, 2, 2, 2]
+        stem_padding = "same"
+        avg_down = False
+        down_growth = False
+        cross_linear = False
     else:
         raise ValueError(
             f"Currently, the architecture {timm_architecture} is not supported."
@@ -38,6 +97,11 @@ def convert_backbone_config(timm_config):
         block_type=block_type,
         stackwise_strides=stackwise_strides,
         down_growth=down_growth,
+        stem_pooling=stem_pooling,
+        stem_padding=stem_padding,
+        avg_down=avg_down,
+        cross_linear=cross_linear,
+        groups=groups,
     )
 
 
@@ -81,21 +145,36 @@ def port_batch_normalization(hf_weight_prefix, keras_layer_name):
     stackwise_depth = backbone.stackwise_depth
     stage_type = backbone.stage_type
     block_type = backbone.block_type
+    strides = backbone.stackwise_strides
 
     for idx, block in enumerate(stackwise_depth):
-        port_conv2d(
-            f"stages.{idx}.conv_down.conv",
-            f"stage_{idx}_{stage_type}_conv_down_1",
-        )
-        port_batch_normalization(
-            f"stages.{idx}.conv_down.bn", f"stage_{idx}_{stage_type}_bn_1"
-        )
-        port_conv2d(
-            f"stages.{idx}.conv_exp.conv", f"stage_{idx}_{stage_type}_conv_exp"
-        )
-        port_batch_normalization(
-            f"stages.{idx}.conv_exp.bn", f"stage_{idx}_{stage_type}_bn_2"
-        )
+        if strides[idx] != 1 or stage_type == "dark":
+            if strides[idx] == 2 and backbone.avg_down:
+                port_conv2d(
+                    f"stages.{idx}.conv_down.1.conv",
+                    f"stage_{idx}_{stage_type}_conv_down_1",
+                )
+                port_batch_normalization(
+                    f"stages.{idx}.conv_down.1.bn",
+                    f"stage_{idx}_{stage_type}_bn_1",
+                )
+            else:
+                port_conv2d(
+                    f"stages.{idx}.conv_down.conv",
+                    f"stage_{idx}_{stage_type}_conv_down_1",
+                )
+                port_batch_normalization(
+                    f"stages.{idx}.conv_down.bn",
+                    f"stage_{idx}_{stage_type}_bn_1",
+                )
+        if stage_type != "dark":
+            port_conv2d(
+                f"stages.{idx}.conv_exp.conv",
+                f"stage_{idx}_{stage_type}_conv_exp",
+            )
+            port_batch_normalization(
+                f"stages.{idx}.conv_exp.bn", f"stage_{idx}_{stage_type}_bn_2"
+            )
 
         for i in range(block):
             port_conv2d(
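As a concrete example of the mapping above, for a downsampling csp stage at idx == 0 the ported pairs are stages.0.conv_down.conv -> stage_0_csp_conv_down_1 and stages.0.conv_exp.conv -> stage_0_csp_conv_exp; for dark stages the expansion conv is skipped entirely.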
@@ -133,16 +212,8 @@ def port_batch_normalization(hf_weight_prefix, keras_layer_name):
                 f"stages.{idx}.conv_transition_b.bn",
                 f"stage_{idx}_{stage_type}_transition_b_bn",
             )
-            port_conv2d(
-                f"stages.{idx}.conv_transition.conv",
-                f"stage_{idx}_{stage_type}_conv_transition",
-            )
-            port_batch_normalization(
-                f"stages.{idx}.conv_transition.bn",
-                f"stage_{idx}_{stage_type}_transition_bn",
-            )
 
-        else:
+        if stage_type != "dark":
             port_conv2d(
                 f"stages.{idx}.conv_transition.conv",
                 f"stage_{idx}_{stage_type}_conv_transition",

keras_hub/src/utils/timm/convert_cspnet_test.py

Lines changed: 5 additions & 5 deletions
@@ -6,15 +6,15 @@
 from keras_hub.src.tests.test_case import TestCase
 
 
-class TimmDenseNetBackboneTest(TestCase):
+class TimmCSPNetBackboneTest(TestCase):
     @pytest.mark.large
-    def test_convert_densenet_backbone(self):
+    def test_convert_cspnet_backbone(self):
         model = Backbone.from_preset("hf://timm/cspdarknet53.ra_in1k")
-        outputs = model.predict(ops.ones((1, 224, 224, 3)))
-        self.assertEqual(outputs.shape, (1, 5, 5, 1024))
+        outputs = model.predict(ops.ones((1, 256, 256, 3)))
+        self.assertEqual(outputs.shape, (1, 8, 8, 1024))
 
     @pytest.mark.large
-    def test_convert_densenet_classifier(self):
+    def test_convert_cspnet_classifier(self):
         model = ImageClassifier.from_preset("hf://timm/cspdarknet53.ra_in1k")
         outputs = model.predict(ops.ones((1, 512, 512, 3)))
         self.assertEqual(outputs.shape, (1, 1000))
