diff --git a/dl_lib/data/datasets/builtin.py b/dl_lib/data/datasets/builtin.py
index 457bce5..d2272e2 100644
--- a/dl_lib/data/datasets/builtin.py
+++ b/dl_lib/data/datasets/builtin.py
@@ -53,6 +53,13 @@
         "coco/annotations/instances_val2017_100.json"),
 }
 
+_PREDEFINED_SPLITS_COCO["multi_metal"] = {
+    "multi_metal_coco_2014_train":
+    ("train2014", "annotations/instances_train2014.json"),
+    "multi_metal_coco_2014_val":
+    ("val2014", "annotations/instances_val2014.json"),
+}
+
 
 def register_all_coco(root=osp.join(
         osp.split(osp.split(dl_lib.__file__)[0])[0], "datasets")):
@@ -87,5 +94,5 @@ def register_all_pascal_voc(root=osp.join(
 
 # Register them all under "./datasets"
-register_all_coco()
+register_all_coco(root=r'E:\dataset\uncompressed')
 register_all_pascal_voc()
diff --git a/dl_lib/data/datasets/builtin_meta.py b/dl_lib/data/datasets/builtin_meta.py
index bb311a7..6fc7eff 100644
--- a/dl_lib/data/datasets/builtin_meta.py
+++ b/dl_lib/data/datasets/builtin_meta.py
@@ -140,6 +140,18 @@
     {"color": [250, 141, 255], "isthing": 0, "id": 200, "name": "rug-merged"},
 ]
 
+MULTI_METAL_COCO_CATEGORIES = [
+    {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "flat"},
+    {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "flat_back"},
+    {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "four_flat"},
+    {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "four_hole"},
+    {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "metal_three"},
+    {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "metal_three_back"},
+    {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "one_hole_back"},
+    {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "one_hole_front"},
+    {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "two_back"},
+    {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "two_front"},
+]
 
 
 def _get_coco_instances_meta():
     thing_ids = [k["id"] for k in COCO_CATEGORIES if k["isthing"] == 1]
@@ -155,6 +167,20 @@ def _get_coco_instances_meta():
     }
     return ret
 
+
+def _get_multi_metal_coco_instances_meta():
+    thing_ids = [k["id"] for k in MULTI_METAL_COCO_CATEGORIES if k["isthing"] == 1]
+    thing_colors = [k["color"] for k in MULTI_METAL_COCO_CATEGORIES if k["isthing"] == 1]
+    assert len(thing_ids) == 10, len(thing_ids)
+    # Mapping from the non-contiguous category ids to contiguous ids in [0, 9]
+    thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
+    thing_classes = [k["name"] for k in MULTI_METAL_COCO_CATEGORIES if k["isthing"] == 1]
+    ret = {
+        "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
+        "thing_classes": thing_classes,
+        "thing_colors": thing_colors,
+    }
+    return ret
+
 
 def _get_builtin_metadata(dataset_name):
     if dataset_name == "coco":
@@ -175,4 +201,6 @@ def _get_builtin_metadata(dataset_name):
             "thing_classes": CITYSCAPES_THING_CLASSES,
             "stuff_classes": CITYSCAPES_STUFF_CLASSES,
         }
+    elif dataset_name == "multi_metal":
+        return _get_multi_metal_coco_instances_meta()
     raise KeyError("No built-in metadata for dataset {}".format(dataset_name))
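
Note: a quick sanity check of the registration above, assuming the usual `register_all_coco` path wires the `multi_metal` metadata to these splits (a minimal sketch, not part of the patch):

```python
from dl_lib.data import MetadataCatalog

meta = MetadataCatalog.get("multi_metal_coco_2014_train")
assert meta.thing_classes[0] == "flat"
# category ids 1..10 map onto contiguous training ids 0..9
assert meta.thing_dataset_id_to_contiguous_id[10] == 9
```
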
diff --git a/dl_lib/engine/defaults.py b/dl_lib/engine/defaults.py
index cd323b9..6cb0fb9 100755
--- a/dl_lib/engine/defaults.py
+++ b/dl_lib/engine/defaults.py
@@ -12,6 +12,7 @@
 import argparse
 import logging
 import os
+import sys
 from collections import OrderedDict
 
 import torch
@@ -66,7 +67,7 @@ def default_argument_parser():
     # PyTorch still may leave orphan processes in multi-gpu training.
     # Therefore we use a deterministic way to obtain port,
     # so that users are aware of orphan processes by seeing the port occupied.
-    port = 2 ** 15 + 2 ** 14 + hash(os.getuid()) % 2 ** 14
+    port = 2 ** 15 + 2 ** 14 + hash(1 if sys.platform == "win32" else os.getuid()) % 2 ** 14
     parser.add_argument("--dist-url", default="tcp://127.0.0.1:{}".format(port))
     parser.add_argument(
         "opts",
diff --git a/dl_lib/evaluation/coco_evaluation.py b/dl_lib/evaluation/coco_evaluation.py
index 7371bd5..2dc96a6 100755
--- a/dl_lib/evaluation/coco_evaluation.py
+++ b/dl_lib/evaluation/coco_evaluation.py
@@ -88,7 +88,8 @@ def _tasks_from_config(self, cfg):
         tasks = ("bbox",)
         if cfg.MODEL.MASK_ON:
-            tasks = tasks + ("segm",)
+            # tasks = tasks + ("segm",)
+            pass
         if cfg.MODEL.KEYPOINT_ON:
             tasks = tasks + ("keypoints",)
         return tasks
diff --git a/dl_lib/evaluation/evaluator.py b/dl_lib/evaluation/evaluator.py
index 15f44f6..2e6dabf 100755
--- a/dl_lib/evaluation/evaluator.py
+++ b/dl_lib/evaluation/evaluator.py
@@ -101,7 +101,8 @@ def inference_on_dataset(model, data_loader, evaluator):
     Returns:
         The return value of `evaluator.evaluate()`
     """
-    num_devices = torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
+    # num_devices = torch.distributed.get_world_size() if torch.distributed.is_initialized() else 1
+    num_devices = 1
     logger = logging.getLogger(__name__)
     logger.info("Start inference on {} images".format(len(data_loader)))
@@ -120,6 +121,7 @@ def inference_on_dataset(model, data_loader, evaluator):
 
             start_compute_time = time.time()
             outputs = model(inputs)
+            draw_result(inputs, outputs)
             if torch.cuda.is_available():
                 torch.cuda.synchronize()
             total_compute_time += time.time() - start_compute_time
@@ -160,6 +162,19 @@ def inference_on_dataset(model, data_loader, evaluator):
         results = {}
     return results
 
+
+def draw_result(inputs, outputs):
+    import cv2
+    for input, output in zip(inputs, outputs):
+        file_name = input['file_name']
+        image = cv2.imread(file_name)
+        pred_segmentation = output['instances'].get('pred_segmentation')
+        pred_bbox = output['instances'].get('pred_boxes').tensor
+        for segmentation, bbox in zip(pred_segmentation, pred_bbox):
+            for idx in range(0, segmentation.shape[0], 2):
+                # filled dot; cv2 requires int coordinates and a non-zero thickness
+                cv2.circle(image, (int(segmentation[idx]), int(segmentation[idx + 1])), 2, (0, 255, 0), -1)
+            cv2.rectangle(image, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 255, 0), 2)
+        cv2.imwrite(r'D:\project\COCO_MetalMulti\result\result.jpg', image)
+
 
 @contextmanager
 def inference_context(model):
diff --git a/dl_lib/layers/ROIAlign/ROIAlign_cuda.cu b/dl_lib/layers/ROIAlign/ROIAlign_cuda.cu
index 3f49edc..26702d1 100644
--- a/dl_lib/layers/ROIAlign/ROIAlign_cuda.cu
+++ b/dl_lib/layers/ROIAlign/ROIAlign_cuda.cu
@@ -307,6 +307,10 @@ __global__ void RoIAlignBackwardFeature(
 
 namespace dl_lib {
 
+int ceil_div(int a, int b) {
+  return (a + b - 1) / b;
+}
+
 at::Tensor ROIAlign_forward_cuda(
     const at::Tensor& input,
     const at::Tensor& rois,
@@ -334,7 +338,9 @@ at::Tensor ROIAlign_forward_cuda(
   auto output_size = num_rois * pooled_height * pooled_width * channels;
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  dim3 grid(std::min(at::cuda::ATenCeilDiv(output_size, 512L), 4096L));
+  dim3 grid(std::min(at::cuda::ATenCeilDiv(static_cast<int64_t>(output_size),
+                                           static_cast<int64_t>(512)),
+                     static_cast<int64_t>(4096)));
   dim3 block(512);
 
   if (output.numel() == 0) {
@@ -390,7 +396,9 @@ at::Tensor ROIAlign_backward_cuda(
 
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
-  dim3 grid(std::min(at::cuda::ATenCeilDiv(grad.numel(), 512L), 4096L));
+  dim3 grid(std::min(at::cuda::ATenCeilDiv(static_cast<int64_t>(grad.numel()),
+                                           static_cast<int64_t>(512)),
+                     static_cast<int64_t>(4096)));
   dim3 block(512);
 
   // handle possibly empty gradients
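
Note: the `static_cast<int64_t>` casts exist because the `512L`/`4096L` literals are 32-bit on Windows and no longer match `ATenCeilDiv`'s `int64_t` overload. The grid computation itself is plain ceil-division capped at 4096 blocks, the same arithmetic as the new `ceil_div` helper, in Python terms:

```python
def ceil_div(a: int, b: int) -> int:
    return (a + b - 1) // b

# one 512-thread block per 512 outputs, never more than 4096 blocks
assert ceil_div(1000, 512) == 2
assert min(ceil_div(10_000_000, 512), 4096) == 4096
```
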
diff --git a/dl_lib/network/centernet.py b/dl_lib/network/centernet.py
index f7b36b3..8c84b24 100644
--- a/dl_lib/network/centernet.py
+++ b/dl_lib/network/centernet.py
@@ -10,6 +10,8 @@
 from .generator import CenterNetDecoder, CenterNetGT
 from .loss import modified_focal_loss, reg_l1_loss
 
+import matplotlib.pyplot as plt
+
 
 class CenterNet(nn.Module):
     """
@@ -73,9 +75,9 @@ def forward(self, batched_inputs):
 
         gt_dict = self.get_ground_truth(batched_inputs)
 
-        return self.losses(pred_dict, gt_dict)
+        return self.losses(pred_dict, gt_dict, images)
 
-    def losses(self, pred_dict, gt_dict):
+    def losses(self, pred_dict, gt_dict, images):
         r"""
         calculate losses of pred and gt
 
@@ -107,19 +109,37 @@ def losses(self, pred_dict, gt_dict):
         index = gt_dict['index']
         index = index.to(torch.long)
         # width and height loss, better version
-        loss_wh = reg_l1_loss(pred_dict['wh'], mask, index, gt_dict['wh'])
+        loss_wh, _, _ = reg_l1_loss(pred_dict['wh'], mask, index, gt_dict['wh'])
 
         # regression loss
-        loss_reg = reg_l1_loss(pred_dict['reg'], mask, index, gt_dict['reg'])
+        loss_reg, _, _ = reg_l1_loss(pred_dict['reg'], mask, index, gt_dict['reg'])
+        loss_segmentation_x, pred_x_s, gt_x_s = reg_l1_loss(pred_dict['segmentation_x'], mask, index, gt_dict['segmentation_x'])
+        loss_segmentation_y, pred_y_s, gt_y_s = reg_l1_loss(pred_dict['segmentation_y'], mask, index, gt_dict['segmentation_y'])
+
+        # debug visualization: plot predicted vs. ground-truth contour points on
+        # the first image of the batch (note: plt.show() blocks every iteration)
+        for pred_x, gt_x, pred_y, gt_y in zip(pred_x_s[0], gt_x_s[0], pred_y_s[0], gt_y_s[0]):
+            pred_x = pred_x.cpu().data.numpy() * 512
+            gt_x = gt_x.cpu().data.numpy() * 512
+            pred_y = pred_y.cpu().data.numpy() * 512
+            gt_y = gt_y.cpu().data.numpy() * 512
+            # plt.scatter(i[:, 1], i[:, 0], color='b')
+            plt.imshow(np.transpose(images[0].cpu().data.numpy(), (1, 2, 0)))
+            plt.scatter(gt_x, gt_y, color='g')
+            plt.scatter(pred_x, pred_y, color='r')
+            plt.show()
 
         loss_cls *= self.cfg.MODEL.LOSS.CLS_WEIGHT
         loss_wh *= self.cfg.MODEL.LOSS.WH_WEIGHT
         loss_reg *= self.cfg.MODEL.LOSS.REG_WEIGHT
+        loss_segmentation_x *= self.cfg.MODEL.LOSS.SEG_WEIGHT
+        loss_segmentation_y *= self.cfg.MODEL.LOSS.SEG_WEIGHT
 
         loss = {
             "loss_cls": loss_cls,
             "loss_box_wh": loss_wh,
             "loss_center_reg": loss_reg,
+            "loss_segmentation_x": loss_segmentation_x,
+            "loss_segmentation_y": loss_segmentation_y,
         }
         # print(loss)
         return loss
@@ -168,8 +188,12 @@ def decode_prediction(self, pred_dict, img_info):
         fmap = pred_dict["cls"]
         reg = pred_dict["reg"]
         wh = pred_dict["wh"]
+        segmentation_x = pred_dict["segmentation_x"] if 'segmentation_x' in pred_dict else None
+        segmentation_y = pred_dict["segmentation_y"] if 'segmentation_y' in pred_dict else None
+        segmentation = (segmentation_x, segmentation_y) if segmentation_x is not None and segmentation_y is not None \
+            else None
 
-        boxes, scores, classes = CenterNetDecoder.decode(fmap, wh, reg)
+        boxes, scores, classes, segmentation = CenterNetDecoder.decode(fmap, wh, reg, segmentation=segmentation)
         # boxes = Boxes(boxes.reshape(boxes.shape[-2:]))
         scores = scores.reshape(-1)
         classes = classes.reshape(-1).to(torch.int64)
@@ -177,7 +201,8 @@ def decode_prediction(self, pred_dict, img_info):
         # dets = CenterNetDecoder.decode(fmap, wh, reg)
         boxes = CenterNetDecoder.transform_boxes(boxes, img_info)
         boxes = Boxes(boxes)
-        return dict(pred_boxes=boxes, scores=scores, pred_classes=classes)
+        segmentation = CenterNetDecoder.transform_segmentation(segmentation, img_info)
+        return dict(pred_boxes=boxes, scores=scores, pred_classes=classes, pred_segmentation=segmentation)
 
     def preprocess_image(self, batched_inputs):
         """
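
Note: the `segmentation` tuple handed to `CenterNetDecoder.decode` carries one per-axis map each; the decoder then weaves the two back into a single `[x0, y0, x1, y1, ...]` tensor. A toy illustration of that interleaving:

```python
import torch

xs = torch.tensor([[[0.1, 0.2, 0.3, 0.4]]])  # (batch, K objects, points)
ys = torch.tensor([[[0.5, 0.6, 0.7, 0.8]]])
seg = torch.zeros(1, 1, 8)
seg[:, :, 0::2] = xs
seg[:, :, 1::2] = ys
# seg[0, 0] == [0.1, 0.5, 0.2, 0.6, 0.3, 0.7, 0.4, 0.8]
```
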
diff --git a/dl_lib/network/generator/centernet_decode.py b/dl_lib/network/generator/centernet_decode.py
index 0369290..28b5413 100644
--- a/dl_lib/network/generator/centernet_decode.py
+++ b/dl_lib/network/generator/centernet_decode.py
@@ -14,7 +14,7 @@ class CenterNetDecoder(object):
 
     @staticmethod
-    def decode(fmap, wh, reg=None, cat_spec_wh=False, K=100):
+    def decode(fmap, wh, reg=None, cat_spec_wh=False, K=100, segmentation=None):
         r"""
         decode output feature map to detection results
 
@@ -47,6 +47,16 @@ def decode(fmap, wh, reg=None, cat_spec_wh=False, K=100):
         else:
             wh = wh.reshape(batch, K, 2)
 
+        if segmentation is not None:
+            segmentation_x = gather_feature(segmentation[0], index, use_transform=True)
+            segmentation_y = gather_feature(segmentation[1], index, use_transform=True)
+            batch_size = segmentation_x.shape[0]
+            objects_num = segmentation_x.shape[1]
+            points_num = segmentation_x.shape[2]
+            # interleave x and y into [x0, y0, x1, y1, ...] on the same device as the inputs
+            segmentation = torch.zeros((batch_size, objects_num, points_num * 2), device=segmentation_x.device)
+            segmentation[:, :, 0::2] = segmentation_x
+            segmentation[:, :, 1::2] = segmentation_y
+
         clses = clses.reshape(batch, K, 1).float()
         scores = scores.reshape(batch, K, 1)
 
@@ -55,7 +65,7 @@ def decode(fmap, wh, reg=None, cat_spec_wh=False, K=100):
                             xs + half_w, ys + half_h], dim=2)
 
-        detections = (bboxes, scores, clses)
+        detections = (bboxes, scores, clses, segmentation)
 
         return detections
 
@@ -82,6 +92,29 @@ def transform_boxes(boxes, img_info, scale=1):
         target_boxes = np.dot(aug_coords, trans.T).reshape(-1, 4)
         return target_boxes
 
+    @staticmethod
+    def transform_segmentation(boxes, img_info, scale=1):
+        r"""
+        transform predicted segmentation points back to the original image coordinates
+
+        Args:
+            boxes(Tensor): torch Tensor with (Batch, N, 8) shape, i.e. 4 interleaved (x, y) points
+            img_info(dict): dict contains all information of original image
+            scale(float): used for multiscale testing
+        """
+        boxes = boxes.cpu().numpy().reshape(-1, 8)
+
+        center = img_info['center']
+        size = img_info['size']
+        output_size = (img_info['width'], img_info['height'])
+        src, dst = CenterAffine.generate_src_and_dst(center, size, output_size)
+        trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
+
+        coords = boxes.reshape(-1, 2)
+        aug_coords = np.column_stack((coords, np.ones(coords.shape[0])))
+        target_segmentation = np.dot(aug_coords, trans.T).reshape(-1, 8)
+        return target_segmentation
+
     @staticmethod
     def pseudo_nms(fmap, pool_size=3):
         r"""
@@ -107,13 +140,13 @@ def topk_score(scores, K=40):
         topk_scores, topk_inds = torch.topk(scores.reshape(batch, channel, -1), K)
 
         topk_inds = topk_inds % (height * width)
-        topk_ys = (topk_inds / width).int().float()
+        topk_ys = (topk_inds.true_divide(width)).int().float()
        topk_xs = (topk_inds % width).int().float()
 
         # get all topk in in a batch
         topk_score, index = torch.topk(topk_scores.reshape(batch, -1), K)
         # div by K because index is grouped by K(C x K shape)
-        topk_clses = (index / K).int()
+        topk_clses = (index.true_divide(K)).int()
         topk_inds = gather_feature(topk_inds.view(batch, -1, 1), index).reshape(batch, K)
         topk_ys = gather_feature(topk_ys.reshape(batch, -1, 1), index).reshape(batch, K)
         topk_xs = gather_feature(topk_xs.reshape(batch, -1, 1), index).reshape(batch, K)
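
Note: the `true_divide` changes keep the old floor semantics while avoiding the integer-division deprecation PyTorch 1.5+ raises for `/` on integer tensors; the indices here are non-negative, so truncation via `.int()` and floor agree:

```python
import torch

topk_inds = torch.tensor([0, 130, 260])
width = 128
rows = topk_inds.true_divide(width).int()  # tensor([0, 1, 2], dtype=torch.int32)
cols = (topk_inds % width).int()           # tensor([0, 2, 4], dtype=torch.int32)
```
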
diff --git a/dl_lib/network/generator/centernet_gt.py b/dl_lib/network/generator/centernet_gt.py
index 60b172e..66a46a1 100644
--- a/dl_lib/network/generator/centernet_gt.py
+++ b/dl_lib/network/generator/centernet_gt.py
@@ -15,8 +15,11 @@ def generate(config, batched_input):
         output_size = config.INPUT.OUTPUT_SIZE
         min_overlap = config.MODEL.CENTERNET.MIN_OVERLAP
         tensor_dim = config.MODEL.CENTERNET.TENSOR_DIM
+        num_polygons_points = config.MODEL.CENTERNET.NUM_POLYGON_POINTS
 
         scoremap_list, wh_list, reg_list, reg_mask_list, index_list = [[] for i in range(5)]
+        segmentation_list_x = []
+        segmentation_list_y = []
 
         for data in batched_input:
             # img_size = (data['height'], data['width'])
@@ -28,6 +31,8 @@ def generate(config, batched_input):
             gt_reg = torch.zeros_like(gt_wh)
             reg_mask = torch.zeros(tensor_dim)
             gt_index = torch.zeros(tensor_dim)
+            gt_segmentation_x = torch.ones(tensor_dim, num_polygons_points) * -128
+            gt_segmentation_y = torch.ones(tensor_dim, num_polygons_points) * -128
             # pass
 
             boxes, classes = bbox_dict['gt_boxes'], bbox_dict['gt_classes']
@@ -50,11 +55,28 @@ def generate(config, batched_input):
             )
             gt_wh[:num_boxes] = wh
 
+            masks = bbox_dict['gt_masks']
+            gt_segmentation_x[:num_boxes], gt_segmentation_y[:num_boxes] = \
+                masks.normalized_by_length(box_tensor.numpy(),
+                                           num_polygons_points,
+                                           box_scale)
+            gt_segmentation = torch.zeros(num_boxes, num_polygons_points * 2)
+            gt_segmentation[:, 0::2] = gt_segmentation_x[:num_boxes]
+            gt_segmentation[:, 1::2] = gt_segmentation_y[:num_boxes]
+            # debug visualization: dump the resized image with its gt boxes
+            import cv2
+            image = data['image'].numpy().transpose((1, 2, 0))
+            image = cv2.resize(image, (128, 128))
+            for bbox in box_tensor:
+                cv2.rectangle(image, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 255, 0), 2)
+            cv2.imwrite(r'D:\project\COCO_MetalMulti\result\result.jpg', image)
+
             scoremap_list.append(gt_scoremap)
             wh_list.append(gt_wh)
             reg_list.append(gt_reg)
             reg_mask_list.append(reg_mask)
             index_list.append(gt_index)
+            segmentation_list_x.append(gt_segmentation_x)
+            segmentation_list_y.append(gt_segmentation_y)
 
         gt_dict = {
             "score_map": torch.stack(scoremap_list, dim=0),
@@ -62,6 +84,8 @@ def generate(config, batched_input):
             "reg": torch.stack(reg_list, dim=0),
             "reg_mask": torch.stack(reg_mask_list, dim=0),
             "index": torch.stack(index_list, dim=0),
+            "segmentation_x": torch.stack(segmentation_list_x, dim=0),
+            "segmentation_y": torch.stack(segmentation_list_y, dim=0),
         }
         return gt_dict
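
Note: the -128 fill acts as a sentinel. Only the first `num_boxes` rows of the (TENSOR_DIM, NUM_POLYGON_POINTS) target hold real, [0, 1]-normalized coordinates, so padded slots stay trivially separable. A minimal sketch of the pattern:

```python
import torch

tensor_dim, num_points, num_boxes = 128, 4, 2
gt_x = torch.ones(tensor_dim, num_points) * -128
gt_x[:num_boxes] = torch.rand(num_boxes, num_points)  # stand-in for real targets
valid = gt_x[:, 0] != -128
assert valid.sum() == num_boxes
```
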
diff --git a/dl_lib/network/head/centernet_head.py b/dl_lib/network/head/centernet_head.py
index e1f59b7..f4070f9 100644
--- a/dl_lib/network/head/centernet_head.py
+++ b/dl_lib/network/head/centernet_head.py
@@ -36,15 +36,50 @@ def __init__(self, cfg):
         )
         self.wh_head = SingleHead(64, 2)
         self.reg_head = SingleHead(64, 2)
+        self.segmentation_head_x = SegHead(num_polygon_points=cfg.MODEL.CENTERNET.NUM_POLYGON_POINTS)
+        self.segmentation_head_y = SegHead(num_polygon_points=cfg.MODEL.CENTERNET.NUM_POLYGON_POINTS)
 
     def forward(self, x):
         cls = self.cls_head(x)
         cls = torch.sigmoid(cls)
         wh = self.wh_head(x)
         reg = self.reg_head(x)
+        segmentation_x = self.segmentation_head_x(x)
+        segmentation_y = self.segmentation_head_y(x)
         pred = {
             'cls': cls,
             'wh': wh,
-            'reg': reg
+            'reg': reg,
+            'segmentation_x': segmentation_x,
+            'segmentation_y': segmentation_y
         }
         return pred
+
+
+class SegHead(nn.Module):
+    def __init__(self, num_convs=2, in_channels=64, conv_out_channels=64, conv_kernel_size=3, num_polygon_points=4):
+        super(SegHead, self).__init__()
+        self.num_convs = num_convs
+        self.in_channels = in_channels
+        self.conv_out_channels = conv_out_channels
+        self.conv_kernel_size = conv_kernel_size
+        self.relu = nn.ReLU(inplace=True)
+        self.out_conv = nn.Conv2d(conv_out_channels, num_polygon_points, 1)
+
+        self.convs = nn.ModuleList()
+        for i in range(self.num_convs):
+            in_channels = (
+                self.in_channels if i == 0 else self.conv_out_channels)
+            padding = (self.conv_kernel_size - 1) // 2
+            self.convs.append(
+                nn.Conv2d(
+                    in_channels,
+                    self.conv_out_channels,
+                    self.conv_kernel_size,
+                    padding=padding))
+
+    def forward(self, x):
+        for conv in self.convs:
+            x = conv(x)
+            x = self.relu(x)
+        x = self.out_conv(x)
+        return x
diff --git a/dl_lib/network/loss/reg_l1_loss.py b/dl_lib/network/loss/reg_l1_loss.py
index dee718e..7e90e3f 100644
--- a/dl_lib/network/loss/reg_l1_loss.py
+++ b/dl_lib/network/loss/reg_l1_loss.py
@@ -11,4 +11,4 @@ def reg_l1_loss(output, mask, index, target):
     # loss = F.l1_loss(pred * mask, target * mask, reduction='elementwise_mean')
     loss = F.l1_loss(pred * mask, target * mask, reduction='sum')
     loss = loss / (mask.sum() + 1e-4)
-    return loss
+    return loss, pred * mask, target * mask
diff --git a/dl_lib/structures/masks.py b/dl_lib/structures/masks.py
index 7b2cb5e..d3a77ce 100755
--- a/dl_lib/structures/masks.py
+++ b/dl_lib/structures/masks.py
@@ -394,6 +394,61 @@ def area(self):
 
         return torch.tensor(area)
 
+    def normalized_by_length(self, bboxes, num_of_target, bbox_scale):
+        if bboxes.shape[0] == 0:
+            return torch.ones(0, num_of_target) * -128, torch.ones(0, num_of_target) * -128
+
+        polygons_x = []
+        polygons_y = []
+        for idx, polygons_per_instance in enumerate(self.polygons):
+            # sum_of_points = sum([int(line.shape[0]/2) for line in polygons_per_instance])
+            # lines_per_polygons_x = []
+            # lines_per_polygons_y = []
+            # bbox = bboxes[idx]
+            # for line in polygons_per_instance:
+            #     num_of_points = int(line.shape[0]/2)
+            #     line *= bbox_scale
+            #     line = np.roll(line, -np.argwhere(line==line[np.argwhere(line[1::2]==line[1::2].min())*2].min()).min())
+            #     normalized_line_x = (line[0::2] - np.tile(bbox[0], num_of_points))
+            #     normalized_line_y = (line[1::2] - np.tile(bbox[1], num_of_points))
+            #     lines_per_polygons_x.append(normalized_line_x)
+            #     lines_per_polygons_y.append(normalized_line_y)
+            # if sum_of_points > num_of_target:
+            #     sample_idx = np.linspace(0, sum_of_points, num=num_of_target, endpoint=False, dtype=np.int32)
+            #     sum_of_points = int(num_of_target)
+            # else:
+            #     sample_idx = np.arange(0, sum_of_points)
+            # target_numpy_x = np.ones(num_of_target) * -128
+            # target_numpy_x[:sum_of_points] = np.concatenate(lines_per_polygons_x)[sample_idx]
+            # target_numpy_y = np.ones(num_of_target) * -128
+            # target_numpy_y[:sum_of_points] = np.concatenate(lines_per_polygons_y)[sample_idx]
+            line = np.concatenate(polygons_per_instance) * bbox_scale
+
+            # extreme points: for each of left/up/right/down, take the extreme
+            # coordinate plus the matching coordinate on the other axis
+            min_in_x = np.squeeze(np.argwhere(line[0::2] == line[0::2].min()))
+            left_in_x = line[0::2].min()
+            left_in_y = line[line == line[min_in_x * 2 + 1].max()].min()
+
+            min_in_y = np.squeeze(np.argwhere(line[1::2] == line[1::2].min()))
+            up_in_x = line[line == line[min_in_y * 2].min()].min()
+            up_in_y = line[1::2].min()
+
+            max_in_x = np.squeeze(np.argwhere(line[0::2] == line[0::2].max()))
+            right_in_x = line[0::2].max()
+            right_in_y = line[line == line[max_in_x * 2 + 1].min()].min()
+
+            max_in_y = np.squeeze(np.argwhere(line[1::2] == line[1::2].max()))
+            down_in_x = line[line == line[max_in_y * 2].max()].min()
+            down_in_y = line[1::2].max()
+
+            target_numpy_x = np.array([left_in_x, up_in_x, right_in_x, down_in_x]) / 512
+            target_numpy_y = np.array([left_in_y, up_in_y, right_in_y, down_in_y]) / 512
+
+            polygons_x.append(target_numpy_x)
+            polygons_y.append(target_numpy_y)
+
+        return torch.tensor(polygons_x), torch.tensor(polygons_y)
+
+
     @staticmethod
     def cat(polymasks_list: List["PolygonMasks"]) -> "PolygonMasks":
         """
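
Note: `normalized_by_length` reduces each instance polygon to its four extreme points (left/up/right/down), scaled by the fixed 512 input size. A simplified version of the same idea, ignoring the tie-breaking the patch applies when several vertices share an extreme coordinate:

```python
import numpy as np

def extreme_points(line: np.ndarray):
    xs, ys = line[0::2], line[1::2]
    return ((xs.min(), ys[xs.argmin()]),   # left-most
            (xs[ys.argmin()], ys.min()),   # top-most
            (xs.max(), ys[xs.argmax()]),   # right-most
            (xs[ys.argmax()], ys.max()))   # bottom-most

poly = np.array([10, 40, 50, 10, 90, 40, 50, 80], dtype=np.float32)  # x0,y0,x1,y1,...
print(extreme_points(poly))  # ((10., 40.), (50., 10.), (90., 40.), (50., 80.))
```
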
""" diff --git a/playground/centernet.res18.coco.512size/config.py b/playground/centernet.res18.coco.512size/config.py index b64e481..091725d 100644 --- a/playground/centernet.res18.coco.512size/config.py +++ b/playground/centernet.res18.coco.512size/config.py @@ -3,12 +3,13 @@ _config_dict = dict( MODEL=dict( - WEIGHTS="", - RESNETS=dict(DEPTH=18), + WEIGHTS=r"D:\git_projects\CenterNet-better\playground\centernet.res18.coco.512size\exp_3\model_final.pth", + MASK_ON=True, + RESNETS=dict(DEPTH=101), PIXEL_MEAN=[0.485, 0.456, 0.406], PIXEL_STD=[0.229, 0.224, 0.225], CENTERNET=dict( - DECONV_CHANNEL=[512, 256, 128, 64], + DECONV_CHANNEL=[2048, 256, 128, 64], DECONV_KERNEL=[4, 4, 4], NUM_CLASSES=80, MODULATE_DEFORM=True, @@ -16,11 +17,13 @@ DOWN_SCALE=4, MIN_OVERLAP=0.7, TENSOR_DIM=128, + NUM_POLYGON_POINTS=4, ), LOSS=dict( CLS_WEIGHT=1, WH_WEIGHT=0.1, REG_WEIGHT=1, + SEG_WEIGHT=2, ), ), INPUT=dict( @@ -52,16 +55,16 @@ SOLVER=dict( OPTIMIZER=dict( NAME="SGD", - BASE_LR=0.02, + BASE_LR=0.002, WEIGHT_DECAY=1e-4, ), LR_SCHEDULER=dict( GAMMA=0.1, STEPS=(81000, 108000), - MAX_ITER=126000, + MAX_ITER=253000, WARMUP_ITERS=1000, ), - IMS_PER_BATCH=128, + IMS_PER_BATCH=4, ), OUTPUT_DIR=osp.join( '/data/Outputs/model_logs/playground', diff --git a/playground/centernet.res18.coco.512size/test_net.py b/playground/centernet.res18.coco.512size/test_net.py new file mode 100644 index 0000000..51722d0 --- /dev/null +++ b/playground/centernet.res18.coco.512size/test_net.py @@ -0,0 +1,164 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +# Modified by Feng Wang +""" +Detection Training Script. + +This scripts reads a given config file and runs the training or evaluation. +It is an entry point that is made to train standard models in dl_lib. + +In order to let one script support training of many models, +this script contains logic that are specific to these built-in models and therefore +may not be suitable for your own project. +For example, your research project perhaps only needs a single "evaluator". + +Therefore, we recommend you to use dl_lib as an library and take +this file as an example of how to use the library. +You may want to write your own script with your datasets and other customizations. +""" +import glob +import logging +import os +import re +import sys +sys.path.insert(0, '.') # noqa: E402 +from collections import OrderedDict + +import dl_lib.utils.comm as comm +from config import config +from dl_lib.checkpoint import DetectionCheckpointer +from dl_lib.data import MetadataCatalog +from dl_lib.engine import (DefaultTrainer, default_argument_parser, + default_setup, launch) +from dl_lib.evaluation import (COCOEvaluator, DatasetEvaluators, + PascalVOCDetectionEvaluator, verify_results) +from net import build_model + + +class Trainer(DefaultTrainer): + """ + We use the "DefaultTrainer" which contains a number pre-defined logic for + standard training workflow. They may not work for you, especially if you + are working on a new research project. In that case you can use the cleaner + "SimpleTrainer", or write your own training loop. + """ + + @classmethod + def build_evaluator(cls, cfg, dataset_name, output_folder=None): + """ + Create evaluator(s) for a given dataset. + This uses the special metadata "evaluator_type" associated with each builtin dataset. + For your own dataset, you can simply create an evaluator manually in your + script and do not have to worry about the hacky if-else logic here. 
diff --git a/playground/centernet.res18.coco.512size/test_net.py b/playground/centernet.res18.coco.512size/test_net.py
new file mode 100644
index 0000000..51722d0
--- /dev/null
+++ b/playground/centernet.res18.coco.512size/test_net.py
@@ -0,0 +1,164 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# Modified by Feng Wang
+"""
+Detection Training Script.
+
+This script reads a given config file and runs the training or evaluation.
+It is an entry point that is made to train standard models in dl_lib.
+
+In order to let one script support training of many models,
+this script contains logic that is specific to these built-in models and therefore
+may not be suitable for your own project.
+For example, your research project perhaps only needs a single "evaluator".
+
+Therefore, we recommend you to use dl_lib as a library and take
+this file as an example of how to use the library.
+You may want to write your own script with your datasets and other customizations.
+"""
+import glob
+import logging
+import os
+import re
+import sys
+sys.path.insert(0, '.')  # noqa: E402
+from collections import OrderedDict
+
+import dl_lib.utils.comm as comm
+from config import config
+from dl_lib.checkpoint import DetectionCheckpointer
+from dl_lib.data import MetadataCatalog
+from dl_lib.engine import (DefaultTrainer, default_argument_parser,
+                           default_setup, launch)
+from dl_lib.evaluation import (COCOEvaluator, DatasetEvaluators,
+                               PascalVOCDetectionEvaluator, verify_results)
+from net import build_model
+
+
+class Trainer(DefaultTrainer):
+    """
+    We use the "DefaultTrainer" which contains a number of pre-defined logics for
+    standard training workflow. They may not work for you, especially if you
+    are working on a new research project. In that case you can use the cleaner
+    "SimpleTrainer", or write your own training loop.
+    """
+
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
+        """
+        Create evaluator(s) for a given dataset.
+        This uses the special metadata "evaluator_type" associated with each builtin dataset.
+        For your own dataset, you can simply create an evaluator manually in your
+        script and do not have to worry about the hacky if-else logic here.
+        """
+        if output_folder is None:
+            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        evaluator_list = []
+        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
+
+        if evaluator_type in ["coco", "coco_panoptic_seg"]:
+            evaluator_list.append(
+                COCOEvaluator(
+                    dataset_name, cfg, True,
+                    output_folder, dump=cfg.GLOBAL.DUMP_TEST
+                ))
+        if evaluator_type == "pascal_voc":
+            return PascalVOCDetectionEvaluator(dataset_name)
+
+        if len(evaluator_list) == 0:
+            raise NotImplementedError(
+                "no Evaluator for the dataset {} with the type {}".format(
+                    dataset_name, evaluator_type
+                )
+            )
+        if len(evaluator_list) == 1:
+            return evaluator_list[0]
+        return DatasetEvaluators(evaluator_list)
+
+    @classmethod
+    def test_with_TTA(cls, cfg, model):
+        logger = logging.getLogger("dl_lib.trainer")
+        # In the end of training, run an evaluation with TTA
+        # Only support some R-CNN models.
+        logger.info("Running inference with test-time augmentation ...")
+        from dl_lib.modeling import GeneralizedRCNNWithTTA
+        model = GeneralizedRCNNWithTTA(cfg, model)
+        evaluators = [
+            cls.build_evaluator(
+                cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
+            )
+            for name in cfg.DATASETS.TEST
+        ]
+        res = cls.test(cfg, model, evaluators)
+        res = OrderedDict({k + "_TTA": v for k, v in res.items()})
+        return res
+
+
+def test_argument_parser():
+    parser = default_argument_parser()
+    parser.add_argument("--start-iter", type=int, default=0, help="start iter used to test")
+    parser.add_argument("--end-iter", type=int, default=None,
+                        help="end iter used to test")
+    parser.add_argument("--debug", action="store_true", help="use debug mode or not")
+    return parser
+
+
+def main(args):
+    config.merge_from_list(args.opts)
+    cfg, logger = default_setup(config, args)
+    if args.debug:
+        batches = int(cfg.SOLVER.IMS_PER_BATCH / 8 * args.num_gpus)
+        if cfg.SOLVER.IMS_PER_BATCH != batches:
+            cfg.SOLVER.IMS_PER_BATCH = batches
+            logger.warning("SOLVER.IMS_PER_BATCH is changed to {}".format(batches))
+
+    if "MODEL.WEIGHTS" in args.opts:
+        valid_files = [cfg.MODEL.WEIGHTS]
+    else:
+        list_of_files = glob.glob(os.path.join(cfg.OUTPUT_DIR, '*.pth'))
+        assert list_of_files, "no pth file found in {}".format(cfg.OUTPUT_DIR)
+        list_of_files.sort(key=os.path.getctime)
+        latest_file = list_of_files[-1]
+        if not args.end_iter:
+            valid_files = [latest_file]
+        else:
+            files = [f for f in list_of_files if str(f) <= str(latest_file)]
+            valid_files = []
+            for f in files:
+                try:
+                    model_iter = int(re.split(r'(model_|\.pth)', f)[-3])
+                except Exception:
+                    logger.warning("remove {}".format(f))
+                    continue
+                if args.start_iter <= model_iter <= args.end_iter:
+                    valid_files.append(f)
+            assert valid_files, "No .pth files satisfy your requirement"
+
+    # * means all if need specific format then *.csv
+    for current_file in valid_files:
+        cfg.MODEL.WEIGHTS = current_file
+        model = build_model(cfg)
+
+        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+            cfg.MODEL.WEIGHTS, resume=args.resume
+        )
+        res = Trainer.test(cfg, model)
+        if comm.is_main_process():
+            verify_results(cfg, res)
+        if cfg.TEST.AUG.ENABLED:
+            res.update(Trainer.test_with_TTA(cfg, model))
+
+    # return res
+
+
+if __name__ == "__main__":
+    args = test_argument_parser().parse_args()
+    print("Command Line Args:", args)
+    launch(
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args,),
+    )
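
Note: the iteration filter above leans on `re.split` keeping captured delimiters, so the digits always sit at index -3:

```python
import re

parts = re.split(r'(model_|\.pth)', '/logs/model_0009999.pth')
# ['/logs/', 'model_', '0009999', '.pth', '']
assert int(parts[-3]) == 9999
```
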
diff --git a/playground/centernet.res18.coco.512size/train_net.py b/playground/centernet.res18.coco.512size/train_net.py
new file mode 100644
index 0000000..a144e30
--- /dev/null
+++ b/playground/centernet.res18.coco.512size/train_net.py
@@ -0,0 +1,127 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# Modified by Feng Wang
+"""
+Detection Training Script.
+
+This script reads a given config file and runs the training or evaluation.
+It is an entry point that is made to train standard models in dl_lib.
+
+In order to let one script support training of many models,
+this script contains logic that is specific to these built-in models and therefore
+may not be suitable for your own project.
+For example, your research project perhaps only needs a single "evaluator".
+
+Therefore, we recommend you to use dl_lib as a library and take
+this file as an example of how to use the library.
+You may want to write your own script with your datasets and other customizations.
+"""
+
+import os
+import sys
+sys.path.insert(0, '.')  # noqa: E402
+
+from colorama import Fore, Style
+
+import dl_lib.utils.comm as comm
+from config import config
+from dl_lib.checkpoint import DetectionCheckpointer
+from dl_lib.data import MetadataCatalog
+from dl_lib.engine import (DefaultTrainer, default_argument_parser,
+                           default_setup, hooks, launch)
+from dl_lib.evaluation import (COCOEvaluator, DatasetEvaluators,
+                               PascalVOCDetectionEvaluator, verify_results)
+from net import build_model
+
+
+class Trainer(DefaultTrainer):
+    """
+    We use the "DefaultTrainer" which contains a number of pre-defined logics for
+    standard training workflow. They may not work for you, especially if you
+    are working on a new research project. In that case you can use the cleaner
+    "SimpleTrainer", or write your own training loop.
+    """
+
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
+        """
+        Create evaluator(s) for a given dataset.
+        This uses the special metadata "evaluator_type" associated with each builtin dataset.
+        For your own dataset, you can simply create an evaluator manually in your
+        script and do not have to worry about the hacky if-else logic here.
+        """
+        if output_folder is None:
+            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        evaluator_list = []
+        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
+
+        if evaluator_type in ["coco", "coco_panoptic_seg"]:
+            evaluator_list.append(
+                COCOEvaluator(
+                    dataset_name, cfg, True,
+                    output_folder, dump=cfg.GLOBAL.DUMP_TRAIN
+                ))
+        elif evaluator_type == "pascal_voc":
+            return PascalVOCDetectionEvaluator(dataset_name)
+
+        if len(evaluator_list) == 0:
+            raise NotImplementedError(
+                "no Evaluator for the dataset {} with the type {}".format(
+                    dataset_name, evaluator_type
+                )
+            )
+        elif len(evaluator_list) == 1:
+            return evaluator_list[0]
+        return DatasetEvaluators(evaluator_list)
+
+
+def main(args):
+    config.merge_from_list(args.opts)
+    cfg, logger = default_setup(config, args)
+    model = build_model(cfg)
+    logger.info(f"Model structure: {model}")
+    if sys.platform == "linux":
+        file_sys = os.statvfs(cfg.OUTPUT_DIR)
+        free_space_Gb = (file_sys.f_bfree * file_sys.f_frsize) / 2**30
+        # We assume that a single dumped model is 700Mb
+        eval_space_Gb = (cfg.SOLVER.LR_SCHEDULER.MAX_ITER // cfg.SOLVER.CHECKPOINT_PERIOD) * 700 / 2**10
+        if eval_space_Gb > free_space_Gb:
+            logger.warning(f"{Fore.RED}Remaining space({free_space_Gb}GB) "
+                           f"is less than ({eval_space_Gb}GB){Style.RESET_ALL}")
+    if args.eval_only:
+        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
+            cfg.MODEL.WEIGHTS, resume=args.resume
+        )
+        res = Trainer.test(cfg, model)
+        if comm.is_main_process():
+            verify_results(cfg, res)
+        if cfg.TEST.AUG.ENABLED:
+            res.update(Trainer.test_with_TTA(cfg, model))
+        return res
+
+    """
+    If you'd like to do anything fancier than the standard training logic,
+    consider writing your own training loop or subclassing the trainer.
+    """
+    trainer = Trainer(cfg, model)
+    trainer.resume_or_load(resume=args.resume)
+    if cfg.TEST.AUG.ENABLED:
+        trainer.register_hooks(
+            [hooks.EvalHook(0, lambda: trainer.test_with_TTA(cfg, trainer.model))]
+        )
+
+    return trainer.train()
+
+
+if __name__ == "__main__":
+    args = default_argument_parser().parse_args()
+    print("soft link to {}".format(config.OUTPUT_DIR))
+    config.link_log()
+    print("Command Line Args:", args)
+    launch(
+        main,
+        args.num_gpus,
+        num_machines=args.num_machines,
+        machine_rank=args.machine_rank,
+        dist_url=args.dist_url,
+        args=(args,),
+    )
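
Note: the free-space guard is Linux-only because `os.statvfs` does not exist on Windows; `shutil.disk_usage` would be the portable equivalent. The estimate itself, under the script's ~700 MB-per-checkpoint assumption (the CHECKPOINT_PERIOD below is illustrative):

```python
import shutil

def needed_space_gb(max_iter: int, ckpt_period: int, model_mb: float = 700) -> float:
    return (max_iter // ckpt_period) * model_mb / 2**10

free_gb = shutil.disk_usage(".").free / 2**30  # portable, unlike os.statvfs
print(f"{needed_space_gb(253000, 10000):.1f} GB needed, {free_gb:.1f} GB free")
```
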
diff --git a/setup.py b/setup.py
index ed5fe5b..d3e3a62 100644
--- a/setup.py
+++ b/setup.py
@@ -4,6 +4,7 @@
 
 import glob
 import os
+import sys
 
 import torch
 from setuptools import find_packages, setup
@@ -12,6 +13,7 @@
 torch_ver = [int(x) for x in torch.__version__.split(".")[:2]]
 assert torch_ver >= [1, 3], "Requires PyTorch >= 1.3"
 
+os_name = sys.platform
 
 def get_extensions():
     this_dir = os.path.dirname(os.path.abspath(__file__))
@@ -39,6 +41,8 @@ def get_extensions():
             "-D__CUDA_NO_HALF_CONVERSIONS__",
             "-D__CUDA_NO_HALF2_OPERATORS__",
         ]
+        if sys.platform == 'win32':
+            extra_compile_args["nvcc"].append("-D _WIN64")
 
         # It's better if pytorch can do this by default ..
         CC = os.environ.get("CC", None)
@@ -61,13 +65,28 @@ def get_extensions():
 
 cur_dir = os.getcwd()
 
-with open("tools/dl_train", "w") as dl_lib_train:
+
+if os_name == "win32":
+    dl_train_name = "tools/dl_train.bat"
+    dl_test_name = "tools/dl_test.bat"
+    head = f"set OMP_NUM_THREADS=1\n"
+    python_command = "python"
+    parameters = "%*"
+elif os_name == "linux":
+    dl_train_name = "tools/dl_train"
+    dl_test_name = "tools/dl_test"
     head = f"#!/bin/bash\n\nexport OMP_NUM_THREADS=1\n"
+    python_command = "python3"
+    parameters = "$@"
+else:
+    raise Exception("Target OS not supported")
+
+with open(dl_train_name, "w") as dl_lib_train:
     dl_lib_train.write(
-        head + f"python3 {os.path.join(cur_dir, 'tools', 'train_net.py')} $@")
-with open("tools/dl_test", "w") as dl_lib_test:
+        head + f"{python_command} {os.path.join(cur_dir, 'tools', 'train_net.py')} {parameters}")
+with open(dl_test_name, "w") as dl_lib_test:
     dl_lib_test.write(
-        head + f"python3 {os.path.join(cur_dir, 'tools', 'test_net.py')} $@")
+        head + f"{python_command} {os.path.join(cur_dir, 'tools', 'test_net.py')} {parameters}")
 
 setup(
     name="dl_lib",
@@ -95,5 +114,6 @@ def get_extensions():
     extras_require={"all": ["shapely", "psutil"]},
     ext_modules=get_extensions(),
     cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
-    scripts=["tools/dl_train", "tools/dl_test"],
+    scripts=["tools/dl_train", "tools/dl_test"] if os_name == 'linux'
+    else ["tools/dl_train.bat", "tools/dl_test.bat"],
 )
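
Note: for clarity, what the generated launcher ends up containing; sketched here by running the win32 branch by hand (the checkout path is illustrative):

```python
import os

head = "set OMP_NUM_THREADS=1\n"
cur_dir = r"C:\work\CenterNet-better"  # illustrative checkout path
print(head + f"python {os.path.join(cur_dir, 'tools', 'train_net.py')} %*")
# on Windows this prints:
#   set OMP_NUM_THREADS=1
#   python C:\work\CenterNet-better\tools\train_net.py %*
```
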
diff --git a/tools/train_net.py b/tools/train_net.py
index 87d6f2e..a144e30 100644
--- a/tools/train_net.py
+++ b/tools/train_net.py
@@ -79,13 +79,14 @@ def main(args):
     cfg, logger = default_setup(config, args)
     model = build_model(cfg)
     logger.info(f"Model structure: {model}")
-    file_sys = os.statvfs(cfg.OUTPUT_DIR)
-    free_space_Gb = (file_sys.f_bfree * file_sys.f_frsize) / 2**30
-    # We assume that a single dumped model is 700Mb
-    eval_space_Gb = (cfg.SOLVER.LR_SCHEDULER.MAX_ITER // cfg.SOLVER.CHECKPOINT_PERIOD) * 700 / 2**10
-    if eval_space_Gb > free_space_Gb:
-        logger.warning(f"{Fore.RED}Remaining space({free_space_Gb}GB) "
-                       f"is less than ({eval_space_Gb}GB){Style.RESET_ALL}")
+    if sys.platform == "linux":
+        file_sys = os.statvfs(cfg.OUTPUT_DIR)
+        free_space_Gb = (file_sys.f_bfree * file_sys.f_frsize) / 2**30
+        # We assume that a single dumped model is 700Mb
+        eval_space_Gb = (cfg.SOLVER.LR_SCHEDULER.MAX_ITER // cfg.SOLVER.CHECKPOINT_PERIOD) * 700 / 2**10
+        if eval_space_Gb > free_space_Gb:
+            logger.warning(f"{Fore.RED}Remaining space({free_space_Gb}GB) "
+                           f"is less than ({eval_space_Gb}GB){Style.RESET_ALL}")
     if args.eval_only:
         DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
             cfg.MODEL.WEIGHTS, resume=args.resume