diff --git a/detectron/core/config.py b/detectron/core/config.py index d508305..e165e37 100644 --- a/detectron/core/config.py +++ b/detectron/core/config.py @@ -876,6 +876,9 @@ # Number of patches in the dataset __C.BODY_UV_RCNN.NUM_PATCHES = -1 +# Number of semantic parts used to sample annotation points +__C.BODY_UV_RCNN.NUM_SEMANTIC_PARTS = 14 + # Number of stacked Conv layers in body UV head __C.BODY_UV_RCNN.NUM_STACKED_CONVS = 8 # Dimension of the hidden representation output by the body UV head diff --git a/detectron/core/test.py b/detectron/core/test.py index 877d803..4fc5854 100644 --- a/detectron/core/test.py +++ b/detectron/core/test.py @@ -948,7 +948,7 @@ def im_detect_body_uv(model, im_scale, boxes): # Removed squeeze calls due to singleton dimension issues CurAnnIndex = np.argmax(CurAnnIndex, axis=0) CurIndex_UV = np.argmax(CurIndex_UV, axis=0) - CurIndex_UV = CurIndex_UV * (CurAnnIndex>0).astype(np.float32) + CurIndex_UV = CurIndex_UV * (CurAnnIndex > 0).astype(np.float32) output = np.zeros([3, int(by), int(bx)], dtype=np.float32) output[0] = CurIndex_UV @@ -956,8 +956,8 @@ def im_detect_body_uv(model, im_scale, boxes): for part_id in range(1, K): CurrentU = CurU_uv[part_id] CurrentV = CurV_uv[part_id] - output[1, CurIndex_UV==part_id] = CurrentU[CurIndex_UV==part_id] - output[2, CurIndex_UV==part_id] = CurrentV[CurIndex_UV==part_id] + output[1, CurIndex_UV == part_id] = CurrentU[CurIndex_UV == part_id] + output[2, CurIndex_UV == part_id] = CurrentV[CurIndex_UV == part_id] outputs.append(output) num_classes = cfg.MODEL.NUM_CLASSES diff --git a/detectron/datasets/json_dataset.py b/detectron/datasets/json_dataset.py index c6757e5..f65117f 100644 --- a/detectron/datasets/json_dataset.py +++ b/detectron/datasets/json_dataset.py @@ -154,7 +154,7 @@ def _prep_roidb_entry(self, entry): (0, 3, self.num_keypoints), dtype=np.int32 ) if cfg.MODEL.BODY_UV_ON: - entry['ignore_UV_body'] = np.empty((0), dtype=np.bool) + entry['ignore_UV_body'] = np.empty((0), dtype=np.bool) # entry['Box_image_links_body'] = [] # Remove unwanted fields that come from the json file (if they exist) for k in ['date_captured', 'url', 'license', 'file_name']: @@ -200,7 +200,7 @@ def _add_gt_annotations(self, entry): valid_objs.append(obj) valid_segms.append(obj['segmentation']) ### - if 'dp_x' in obj.keys(): + if 'dp_x' in obj: valid_dp_x.append(obj['dp_x']) valid_dp_y.append(obj['dp_y']) valid_dp_I.append(obj['dp_I']) @@ -216,7 +216,7 @@ def _add_gt_annotations(self, entry): valid_dp_masks.append([]) ### num_valid_objs = len(valid_objs) - ## + boxes = np.zeros((num_valid_objs, 4), dtype=entry['boxes'].dtype) gt_classes = np.zeros((num_valid_objs), dtype=entry['gt_classes'].dtype) gt_overlaps = np.zeros( @@ -234,7 +234,7 @@ def _add_gt_annotations(self, entry): dtype=entry['gt_keypoints'].dtype ) if cfg.MODEL.BODY_UV_ON: - ignore_UV_body = np.zeros((num_valid_objs)) + ignore_UV_body = np.zeros((num_valid_objs), dtype=entry['ignore_UV_body'].dtype) #Box_image_body = [None]*num_valid_objs im_has_visible_keypoints = False diff --git a/detectron/datasets/roidb.py b/detectron/datasets/roidb.py index e37e3d1..83382f2 100644 --- a/detectron/datasets/roidb.py +++ b/detectron/datasets/roidb.py @@ -121,7 +121,7 @@ def is_valid(entry): if cfg.MODEL.BODY_UV_ON and cfg.BODY_UV_RCNN.BODY_UV_IMS: # Exclude images with no body uv valid = valid and entry['has_body_uv'] - return valid + return valid num = len(roidb) filtered_roidb = [entry for entry in roidb if is_valid(entry)] diff --git a/detectron/modeling/body_uv_rcnn_heads.py 
b/detectron/modeling/body_uv_rcnn_heads.py index dd67f43..b771dd9 100644 --- a/detectron/modeling/body_uv_rcnn_heads.py +++ b/detectron/modeling/body_uv_rcnn_heads.py @@ -3,7 +3,22 @@ # # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. -# +############################################################################## + +"""Various network "heads" for dense human pose estimation in DensePose. + +The design is as follows: + +... -> RoI ----\ /-> mask output -> cls loss + -> RoIFeatureXform -> body UV head -> patch output -> cls loss +... -> Feature / \-> UV output -> reg loss + Map + +The body UV head produces a feature representation of the RoI for the purpose +of dense semantic mask prediction, body surface patch prediction and body UV +coordinates regression. The body UV output module converts the feature +representation into heatmaps for dense mask, patch index and UV coordinates. +""" from __future__ import absolute_import from __future__ import division @@ -11,142 +26,123 @@ from __future__ import unicode_literals from caffe2.python import core - from detectron.core.config import cfg - +from detectron.utils.c2 import const_fill import detectron.modeling.ResNet as ResNet import detectron.utils.blob as blob_utils # ---------------------------------------------------------------------------- # -# Body UV heads +# Body UV outputs and losses # ---------------------------------------------------------------------------- # -def add_body_uv_outputs(model, blob_in, dim, pref=''): - #### - model.ConvTranspose(blob_in, 'AnnIndex_lowres'+pref, dim, 15,cfg.BODY_UV_RCNN.DECONV_KERNEL, pad=int(cfg.BODY_UV_RCNN.DECONV_KERNEL / 2 - 1), stride=2, weight_init=(cfg.BODY_UV_RCNN.CONV_INIT, {'std': 0.001}), bias_init=('ConstantFill', {'value': 0.})) - #### - model.ConvTranspose(blob_in, 'Index_UV_lowres'+pref, dim, cfg.BODY_UV_RCNN.NUM_PATCHES+1,cfg.BODY_UV_RCNN.DECONV_KERNEL, pad=int(cfg.BODY_UV_RCNN.DECONV_KERNEL / 2 - 1), stride=2, weight_init=(cfg.BODY_UV_RCNN.CONV_INIT, {'std': 0.001}), bias_init=('ConstantFill', {'value': 0.})) - #### - model.ConvTranspose( - blob_in, 'U_lowres'+pref, dim, (cfg.BODY_UV_RCNN.NUM_PATCHES+1), - cfg.BODY_UV_RCNN.DECONV_KERNEL, - pad=int(cfg.BODY_UV_RCNN.DECONV_KERNEL / 2 - 1), - stride=2, - weight_init=(cfg.BODY_UV_RCNN.CONV_INIT, {'std': 0.001}), - bias_init=('ConstantFill', {'value': 0.})) - ##### - model.ConvTranspose( - blob_in, 'V_lowres'+pref, dim, cfg.BODY_UV_RCNN.NUM_PATCHES+1, +def add_body_uv_outputs(model, blob_in, dim): + """Add DensePose body UV specific outputs: heatmaps of dense mask, patch index + and patch-specific UV coordinates. All dense masks are mapped to labels in + [0, ... S] for S semantically meaningful body parts. 
+ """ + # Apply ConvTranspose to the feature representation; results in 2x upsampling + for name in ['AnnIndex', 'Index_UV', 'U', 'V']: + if name == 'AnnIndex': + dim_out = cfg.BODY_UV_RCNN.NUM_SEMANTIC_PARTS + 1 + else: + dim_out = cfg.BODY_UV_RCNN.NUM_PATCHES + 1 + model.ConvTranspose( + blob_in, + name + '_lowres', + dim, + dim_out, cfg.BODY_UV_RCNN.DECONV_KERNEL, pad=int(cfg.BODY_UV_RCNN.DECONV_KERNEL / 2 - 1), stride=2, weight_init=(cfg.BODY_UV_RCNN.CONV_INIT, {'std': 0.001}), - bias_init=('ConstantFill', {'value': 0.})) - #### - blob_Ann_Index = model.BilinearInterpolation('AnnIndex_lowres'+pref, 'AnnIndex'+pref, cfg.BODY_UV_RCNN.NUM_PATCHES+1 , cfg.BODY_UV_RCNN.NUM_PATCHES+1, cfg.BODY_UV_RCNN.UP_SCALE) - blob_Index = model.BilinearInterpolation('Index_UV_lowres'+pref, 'Index_UV'+pref, cfg.BODY_UV_RCNN.NUM_PATCHES+1 , cfg.BODY_UV_RCNN.NUM_PATCHES+1, cfg.BODY_UV_RCNN.UP_SCALE) - blob_U = model.BilinearInterpolation('U_lowres'+pref, 'U_estimated'+pref, cfg.BODY_UV_RCNN.NUM_PATCHES+1 , cfg.BODY_UV_RCNN.NUM_PATCHES+1, cfg.BODY_UV_RCNN.UP_SCALE) - blob_V = model.BilinearInterpolation('V_lowres'+pref, 'V_estimated'+pref, cfg.BODY_UV_RCNN.NUM_PATCHES+1 , cfg.BODY_UV_RCNN.NUM_PATCHES+1, cfg.BODY_UV_RCNN.UP_SCALE) - ### - return blob_U,blob_V,blob_Index,blob_Ann_Index - - -def add_body_uv_losses(model, pref=''): - - ## Reshape for GT blobs. - model.net.Reshape( ['body_uv_X_points'], ['X_points_reshaped'+pref, 'X_points_shape'+pref], shape=( -1 ,1 ) ) - model.net.Reshape( ['body_uv_Y_points'], ['Y_points_reshaped'+pref, 'Y_points_shape'+pref], shape=( -1 ,1 ) ) - model.net.Reshape( ['body_uv_I_points'], ['I_points_reshaped'+pref, 'I_points_shape'+pref], shape=( -1 ,1 ) ) - model.net.Reshape( ['body_uv_Ind_points'], ['Ind_points_reshaped'+pref, 'Ind_points_shape'+pref], shape=( -1 ,1 ) ) - ## Concat Ind,x,y to get Coordinates blob. 
- model.net.Concat( ['Ind_points_reshaped'+pref,'X_points_reshaped'+pref, \ - 'Y_points_reshaped'+pref],['Coordinates'+pref,'Coordinate_Shapes'+pref ], axis = 1 ) - ## - ### Now reshape UV blobs, such that they are 1x1x(196*NumSamples)xNUM_PATCHES - ## U blob to - ## - model.net.Reshape(['body_uv_U_points'], \ - ['U_points_reshaped'+pref, 'U_points_old_shape'+pref],\ - shape=(-1,cfg.BODY_UV_RCNN.NUM_PATCHES+1,196)) - model.net.Transpose(['U_points_reshaped'+pref] ,['U_points_reshaped_transpose'+pref],axes=(0,2,1) ) - model.net.Reshape(['U_points_reshaped_transpose'+pref], \ - ['U_points'+pref, 'U_points_old_shape2'+pref], \ - shape=(1,1,-1,cfg.BODY_UV_RCNN.NUM_PATCHES+1)) - ## V blob - ## - model.net.Reshape(['body_uv_V_points'], \ - ['V_points_reshaped'+pref, 'V_points_old_shape'+pref],\ - shape=(-1,cfg.BODY_UV_RCNN.NUM_PATCHES+1,196)) - model.net.Transpose(['V_points_reshaped'+pref] ,['V_points_reshaped_transpose'+pref],axes=(0,2,1) ) - model.net.Reshape(['V_points_reshaped_transpose'+pref], \ - ['V_points'+pref, 'V_points_old_shape2'+pref], \ - shape=(1,1,-1,cfg.BODY_UV_RCNN.NUM_PATCHES+1)) - ### - ## UV weights blob - ## - model.net.Reshape(['body_uv_point_weights'], \ - ['Uv_point_weights_reshaped'+pref, 'Uv_point_weights_old_shape'+pref],\ - shape=(-1,cfg.BODY_UV_RCNN.NUM_PATCHES+1,196)) - model.net.Transpose(['Uv_point_weights_reshaped'+pref] ,['Uv_point_weights_reshaped_transpose'+pref],axes=(0,2,1) ) - model.net.Reshape(['Uv_point_weights_reshaped_transpose'+pref], \ - ['Uv_point_weights'+pref, 'Uv_point_weights_old_shape2'+pref], \ - shape=(1,1,-1,cfg.BODY_UV_RCNN.NUM_PATCHES+1)) - - ##################### - ### Pool IUV for points via bilinear interpolation. - model.PoolPointsInterp(['U_estimated','Coordinates'+pref], ['interp_U'+pref]) - model.PoolPointsInterp(['V_estimated','Coordinates'+pref], ['interp_V'+pref]) - model.PoolPointsInterp(['Index_UV'+pref,'Coordinates'+pref], ['interp_Index_UV'+pref]) - - ## Reshape interpolated UV coordinates to apply the loss. - - model.net.Reshape(['interp_U'+pref], \ - ['interp_U_reshaped'+pref, 'interp_U_shape'+pref],\ - shape=(1, 1, -1 , cfg.BODY_UV_RCNN.NUM_PATCHES+1)) - - model.net.Reshape(['interp_V'+pref], \ - ['interp_V_reshaped'+pref, 'interp_V_shape'+pref],\ - shape=(1, 1, -1 , cfg.BODY_UV_RCNN.NUM_PATCHES+1)) - ### - - ### Do the actual labels here !!!! - model.net.Reshape( ['body_uv_ann_labels'], \ - ['body_uv_ann_labels_reshaped' +pref, 'body_uv_ann_labels_old_shape'+pref], \ - shape=(-1, cfg.BODY_UV_RCNN.HEATMAP_SIZE , cfg.BODY_UV_RCNN.HEATMAP_SIZE)) - - model.net.Reshape( ['body_uv_ann_weights'], \ - ['body_uv_ann_weights_reshaped' +pref, 'body_uv_ann_weights_old_shape'+pref], \ - shape=( -1 , cfg.BODY_UV_RCNN.HEATMAP_SIZE , cfg.BODY_UV_RCNN.HEATMAP_SIZE)) - ### - model.net.Cast( ['I_points_reshaped'+pref], ['I_points_reshaped_int'+pref], to=core.DataType.INT32) - ### Now add the actual losses - ## The mask segmentation loss (dense) - probs_seg_AnnIndex, loss_seg_AnnIndex = model.net.SpatialSoftmaxWithLoss( \ - ['AnnIndex'+pref, 'body_uv_ann_labels_reshaped'+pref,'body_uv_ann_weights_reshaped'+pref],\ - ['probs_seg_AnnIndex'+pref,'loss_seg_AnnIndex'+pref], \ - scale=cfg.BODY_UV_RCNN.INDEX_WEIGHTS / cfg.NUM_GPUS) - ## Point Patch Index Loss. - probs_IndexUVPoints, loss_IndexUVPoints = model.net.SoftmaxWithLoss(\ - ['interp_Index_UV'+pref,'I_points_reshaped_int'+pref],\ - ['probs_IndexUVPoints'+pref,'loss_IndexUVPoints'+pref], \ - scale=cfg.BODY_UV_RCNN.PART_WEIGHTS / cfg.NUM_GPUS, spatial=0) - ## U and V point losses. 
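Both the old pipeline above and its rewrite below feed the annotated point coordinates through the custom PoolPointsInterp op to read predictions off the heatmaps at those points. The following NumPy sketch of the op's forward semantics is illustrative only (a hypothetical helper, not part of the patch; boundary handling is simplified to clamping, and 196 is the fixed number of collected points per RoI, as in the real blobs and the CUDA kernel further down):

    import numpy as np

    def pool_points_interp(X, coords, spatial_scale=1.0):
        # X: (N, C, H, W) predicted heatmaps for N fg RoIs.
        # coords: (N * 196, 2) array of (x, y) point locations.
        # Returns (N * 196, C): bilinearly interpolated value per point.
        N, C, H, W = X.shape
        out = np.zeros((coords.shape[0], C), dtype=X.dtype)
        for p, (x, y) in enumerate(coords * spatial_scale):
            n = p // 196  # owning fg RoI, as in the CUDA kernel's n / 196
            x = min(max(x, 0.0), W - 1.0); y = min(max(y, 0.0), H - 1.0)
            x0, y0 = int(x), int(y)
            x1, y1 = min(x0 + 1, W - 1), min(y0 + 1, H - 1)
            lx, ly = x - x0, y - y0
            out[p] = ((1 - ly) * (1 - lx) * X[n, :, y0, x0]
                      + (1 - ly) * lx * X[n, :, y0, x1]
                      + ly * (1 - lx) * X[n, :, y1, x0]
                      + ly * lx * X[n, :, y1, x1])
        return out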
-    loss_Upoints = model.net.SmoothL1Loss( \
-        ['interp_U_reshaped'+pref, 'U_points'+pref, \
-             'Uv_point_weights'+pref, 'Uv_point_weights'+pref], \
-        'loss_Upoints'+pref, \
-            scale=cfg.BODY_UV_RCNN.POINT_REGRESSION_WEIGHTS / cfg.NUM_GPUS)
+            bias_init=const_fill(0.0)
+        )
+    # Increase heatmap output size via bilinear upsampling
+    blob_outputs = []
+    for name in ['AnnIndex', 'Index_UV', 'U', 'V']:
+        blob_outputs.append(
+            model.BilinearInterpolation(
+                name + '_lowres',
+                name + '_estimated' if name in ['U', 'V'] else name,
+                cfg.BODY_UV_RCNN.NUM_PATCHES + 1,
+                cfg.BODY_UV_RCNN.NUM_PATCHES + 1,
+                cfg.BODY_UV_RCNN.UP_SCALE
+            )
+        )
+
+    return blob_outputs
+
+
+def add_body_uv_losses(model):
+    """Add DensePose body UV specific losses."""
+    # Pool estimated IUV points via bilinear interpolation.
+    for name in ['U', 'V', 'Index_UV']:
+        model.PoolPointsInterp(
+            [
+                name + '_estimated' if name in ['U', 'V'] else name,
+                'body_uv_coords_xy'
+            ],
+            ['interp_' + name]
+        )
 
-    loss_Vpoints = model.net.SmoothL1Loss( \
-        ['interp_V_reshaped'+pref, 'V_points'+pref, \
-             'Uv_point_weights'+pref, 'Uv_point_weights'+pref], \
-        'loss_Vpoints'+pref, scale=cfg.BODY_UV_RCNN.POINT_REGRESSION_WEIGHTS / cfg.NUM_GPUS)
-    ## Add the losses.
-    loss_gradients = blob_utils.get_loss_gradients(model, \
-        [ loss_Upoints, loss_Vpoints, loss_seg_AnnIndex, loss_IndexUVPoints])
-    model.losses = list(set(model.losses + \
-        ['loss_Upoints'+pref , 'loss_Vpoints'+pref , \
-         'loss_seg_AnnIndex'+pref ,'loss_IndexUVPoints'+pref]))
+    # Compute spatial softmax normalized probabilities, after which
+    # cross-entropy loss is computed for semantic parts classification.
+    probs_AnnIndex, loss_AnnIndex = model.net.SpatialSoftmaxWithLoss(
+        [
+            'AnnIndex',
+            'body_uv_parts', 'body_uv_parts_weights'
+        ],
+        ['probs_AnnIndex', 'loss_AnnIndex'],
+        scale=cfg.BODY_UV_RCNN.INDEX_WEIGHTS / cfg.NUM_GPUS
+    )
+    # Softmax loss for surface patch classification.
+    probs_I_points, loss_I_points = model.net.SoftmaxWithLoss(
+        ['interp_Index_UV', 'body_uv_I_points'],
+        ['probs_I_points', 'loss_I_points'],
+        scale=cfg.BODY_UV_RCNN.PART_WEIGHTS / cfg.NUM_GPUS,
+        spatial=0
+    )
+    # Smooth L1 loss for patch-specific UV coordinate regression.
+    # Reshape U,V blobs of both interpolated and ground-truth points to
+    # compute a summed (instead of averaged) SmoothL1Loss.
+    loss_UV = list()
+    model.net.Reshape(
+        ['body_uv_point_weights'],
+        ['UV_point_weights', 'body_uv_point_weights_shape'],
+        shape=(1, -1, cfg.BODY_UV_RCNN.NUM_PATCHES + 1)
+    )
+    for name in ['U', 'V']:
+        # Reshape U/V coordinates of both interpolated points and ground-truth
+        # points from (#points, #patches) to (1, #points, #patches).
+        model.net.Reshape(
+            ['body_uv_' + name + '_points'],
+            [name + '_points', 'body_uv_' + name + '_points_shape'],
+            shape=(1, -1, cfg.BODY_UV_RCNN.NUM_PATCHES + 1)
+        )
+        model.net.Reshape(
+            ['interp_' + name],
+            ['interp_' + name + '_reshaped', 'interp_' + name + '_shape'],
+            shape=(1, -1, cfg.BODY_UV_RCNN.NUM_PATCHES + 1)
+        )
+        # Compute summed SmoothL1Loss over all points.
+ loss_UV.append( + model.net.SmoothL1Loss( + [ + 'interp_' + name + '_reshaped', name + '_points', + 'UV_point_weights', 'UV_point_weights' + ], + 'loss_' + name + '_points', + scale=cfg.BODY_UV_RCNN.POINT_REGRESSION_WEIGHTS / cfg.NUM_GPUS + ) + ) + # Add all losses to compute gradients + loss_gradients = blob_utils.get_loss_gradients( + model, [loss_AnnIndex, loss_I_points] + loss_UV + ) + # Update model training losses + model.AddLosses( + ['loss_' + name for name in ['AnnIndex', 'I_points', 'U_points', 'V_points']] + ) return loss_gradients @@ -155,17 +151,17 @@ def add_body_uv_losses(model, pref=''): # Body UV heads # ---------------------------------------------------------------------------- # -def add_ResNet_roi_conv5_head_for_bodyUV( - model, blob_in, dim_in, spatial_scale -): +def add_ResNet_roi_conv5_head_for_bodyUV(model, blob_in, dim_in, spatial_scale): """Add a ResNet "conv5" / "stage5" head for body UV prediction.""" model.RoIFeatureTransform( - blob_in, '_[body_uv]_pool5', + blob_in, + '_[body_uv]_pool5', blob_rois='body_uv_rois', method=cfg.BODY_UV_RCNN.ROI_XFORM_METHOD, resolution=cfg.BODY_UV_RCNN.ROI_XFORM_RESOLUTION, sampling_ratio=cfg.BODY_UV_RCNN.ROI_XFORM_SAMPLING_RATIO, - spatial_scale=spatial_scale) + spatial_scale=spatial_scale + ) # Using the prefix '_[body_uv]_' to 'res5' enables initializing the head's # parameters using pretrained 'res5' parameters if given (see # utils.net.initialize_from_weights_file) @@ -184,7 +180,7 @@ def add_ResNet_roi_conv5_head_for_bodyUV( def add_roi_body_uv_head_v1convX(model, blob_in, dim_in, spatial_scale): - """v1convX design: X * (conv).""" + """Add a DensePose body UV head. v1convX design: X * (conv).""" hidden_dim = cfg.BODY_UV_RCNN.CONV_HEAD_DIM kernel_size = cfg.BODY_UV_RCNN.CONV_HEAD_KERNEL pad_size = kernel_size // 2 @@ -208,7 +204,7 @@ def add_roi_body_uv_head_v1convX(model, blob_in, dim_in, spatial_scale): stride=1, pad=pad_size, weight_init=(cfg.BODY_UV_RCNN.CONV_INIT, {'std': 0.01}), - bias_init=('ConstantFill', {'value': 0.}) + bias_init=const_fill(0.0) ) current = model.Relu(current, current) dim_in = hidden_dim diff --git a/detectron/modeling/model_builder.py b/detectron/modeling/model_builder.py index 35f9f2c..0eb3899 100644 --- a/detectron/modeling/model_builder.py +++ b/detectron/modeling/model_builder.py @@ -329,7 +329,7 @@ def _add_roi_body_uv_head( model, add_roi_body_uv_head_func, blob_in, dim_in, spatial_scale_in ): """Add a body UV prediction head to the model.""" - # Capture model graph before adding the mask head + # Capture model graph before adding the body UV head bbox_net = copy.deepcopy(model.net.Proto()) # Add the body UV head blob_body_uv_head, dim_body_uv_head = add_roi_body_uv_head_func( @@ -343,7 +343,7 @@ def _add_roi_body_uv_head( if not model.train: # == inference # Inference uses a cascade of box predictions, then body uv predictions # This requires separate nets for box and body uv prediction. - # So we extract the keypoint prediction net, store it as its own + # So we extract the body uv prediction net, store it as its own # network, then restore model.net to be the bbox-only network model.body_uv_net, body_uv_blob_out = c2_utils.SuffixNet( 'body_uv_net', model.net, len(bbox_net.op), blobs_body_uv diff --git a/detectron/ops/pool_points_interp.cc b/detectron/ops/pool_points_interp.cc index 0bbc682..54c6245 100644 --- a/detectron/ops/pool_points_interp.cc +++ b/detectron/ops/pool_points_interp.cc @@ -1,33 +1,66 @@ /** -# Copyright (c) Facebook, Inc. and its affiliates. -# All rights reserved. 
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the license found in the
+ * LICENSE file in the root directory of this source tree.
 */
-
 #include "pool_points_interp.h"
 
 namespace caffe2 {
-//namespace {
 
 REGISTER_CPU_OPERATOR(PoolPointsInterp, PoolPointsInterpOp<float, CPUContext>);
 REGISTER_CPU_OPERATOR(PoolPointsInterpGradient, PoolPointsInterpGradientOp<float, CPUContext>);
 
-// Input: X, points; Output: Y
-OPERATOR_SCHEMA(PoolPointsInterp).NumInputs(2).NumOutputs(1);
-// Input: X, points, dY (aka "gradOutput");
-// Output: dX (aka "gradInput")
-OPERATOR_SCHEMA(PoolPointsInterpGradient).NumInputs(3).NumOutputs(1);
+OPERATOR_SCHEMA(PoolPointsInterp)
+    .NumInputs(2)
+    .NumOutputs(1)
+    .Input(
+        0,
+        "X",
+        "4D feature/heat map input of shape (N, C, H, W).")
+    .Input(
+        1,
+        "coords",
+        "2D input of shape (P, 2) specifying P points with 2 columns "
+        "representing 2D coordinates on the image (x, y). The "
+        "coordinates have been converted to the coordinate system of X.")
+    .Output(
+        0,
+        "Y",
+        "2D output of shape (P, K). The r-th batch element is a "
+        "pooled/interpolated index or UV coordinate corresponding "
+        "to the r-th point over all K patches (including background).");
+
+OPERATOR_SCHEMA(PoolPointsInterpGradient)
+    .NumInputs(3)
+    .NumOutputs(1)
+    .Input(
+        0,
+        "X",
+        "See PoolPointsInterp.")
+    .Input(
+        1,
+        "coords",
+        "See PoolPointsInterp.")
+    .Input(
+        2,
+        "dY",
+        "Gradient of forward output 0 (Y)")
+    .Output(
+        0,
+        "dX",
+        "Gradient of forward input 0 (X)");
 
 class GetPoolPointsInterpGradient : public GradientMakerBase {
   using GradientMakerBase::GradientMakerBase;
   vector<OperatorDef> GetGradientDefs() override {
     return SingleGradientDef(
-        "PoolPointsInterpGradient", "",
+        "PoolPointsInterpGradient",
+        "",
         vector<string>{I(0), I(1), GO(0)},
         vector<string>{GI(0)});
   }
@@ -35,5 +68,4 @@ class GetPoolPointsInterpGradient : public GradientMakerBase {
 
 REGISTER_GRADIENT(PoolPointsInterp, GetPoolPointsInterpGradient);
 
-//} // namespace
 } // namespace caffe2
diff --git a/detectron/ops/pool_points_interp.cu b/detectron/ops/pool_points_interp.cu
index 6286e2c..7a9f755 100644
--- a/detectron/ops/pool_points_interp.cu
+++ b/detectron/ops/pool_points_interp.cu
@@ -1,9 +1,9 @@
 /**
-# Copyright (c) Facebook, Inc. and its affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the license found in the
+ * LICENSE file in the root directory of this source tree.
 */
 
 #include <cfloat>
@@ -28,24 +28,26 @@ float gpu_atomic_add(const float val, float* address) {
 
 template <typename T>
 __device__ T bilinear_interpolate(const T* bottom_data,
-    const int height, const int width,
-    T y, T x,
+    const int height, const int width, T x, T y,
     const int index /* index for debug only*/) {
-  // deal with cases that inverse elements are out of feature map boundary
-  if (y < -1.0 || y > height || x < -1.0 || x > width) {
-    //empty
+  if (x < -1.0 || x > width || y < -1.0 || y > height) {
    return 0;
  }
 
-  if (y <= 0) y = 0;
   if (x <= 0) x = 0;
+  if (y <= 0) y = 0;
 
-  int y_low = (int) y;
   int x_low = (int) x;
-  int y_high;
-  int x_high;
+  int y_low = (int) y;
+  int x_high, y_high;
+
+  if (x_low >= width - 1) {
+    x_high = x_low = width - 1;
+    x = (T) x_low;
+  } else {
+    x_high = x_low + 1;
+  }
 
   if (y_low >= height - 1) {
     y_high = y_low = height - 1;
     y = (T) y_low;
@@ -53,82 +55,62 @@ __device__ T bilinear_interpolate(const T* bottom_data,
     y_high = y_low + 1;
   }
 
-  if (x_low >= width - 1) {
-    x_high = x_low = width - 1;
-    x = (T) x_low;
-  } else {
-    x_high = x_low + 1;
-  }
+  // lambdas in X, Y axes
+  T lx = x - x_low, ly = y - y_low;
+  T hx = 1. - lx, hy = 1. - ly;
 
-  T ly = y - y_low;
-  T lx = x - x_low;
-  T hy = 1. - ly, hx = 1. - lx;
   // do bilinear interpolation
-  T v1 = bottom_data[y_low * width + x_low];
-  T v2 = bottom_data[y_low * width + x_high];
-  T v3 = bottom_data[y_high * width + x_low];
-  T v4 = bottom_data[y_high * width + x_high];
+  T v1 = bottom_data[y_low * width + x_low];    // top-left point
+  T v2 = bottom_data[y_low * width + x_high];   // top-right point
+  T v3 = bottom_data[y_high * width + x_low];   // bottom-left point
+  T v4 = bottom_data[y_high * width + x_high];  // bottom-right point
   T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
 
-  T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-
-  return val;
+  return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
 }
 
 template <typename T>
-__global__ void PointWarpForward(const int nthreads, const T* bottom_data,
-    const T spatial_scale, const int channels,
-    const int height, const int width,
-    const T* bottom_rois, T* top_data) {
+__global__ void PoolPointsInterpForward(const int nthreads, const T* bottom_data,
+    const T spatial_scale, const int channels, const int height, const int width,
+    const T* coords, T* top_data) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c) is an element in the pooled/interpolated output
     int c = index % channels;
     int n = index / channels;
-    //
-    const T* offset_bottom_rois = bottom_rois + n * 3;
-
-    int roi_batch_ind = n/196; // Should be original !!
-    //
-    T X_point = offset_bottom_rois[1] * spatial_scale;
-    T Y_point = offset_bottom_rois[2] * spatial_scale;
-
-
+    const T* offset_coords = coords + n * 2;
+    // Get index of current fg roi among all fg rois in a minibatch
+    int roi_batch_ind = n / 196;
+    // Get spatial coordinate (x, y)
+    T x = offset_coords[0] * spatial_scale;
+    T y = offset_coords[1] * spatial_scale;
     const T* offset_bottom_data =
         bottom_data + (roi_batch_ind * channels + c) * height * width;
-
-    T val = bilinear_interpolate(offset_bottom_data, height, width, Y_point, X_point, index);
-    top_data[index] = val;
+    // Compute interpolated value
+    top_data[index] = bilinear_interpolate(
+        offset_bottom_data, height, width, x, y, index);
   }
 }
 
 template <typename T>
 __device__ void bilinear_interpolate_gradient(
-    const int height, const int width,
-    T y, T x,
+    const int height, const int width, T x, T y,
     T & w1, T & w2, T & w3, T & w4,
     int & x_low, int & x_high, int & y_low, int & y_high,
     const int index /* index for debug only*/) {
-  // deal with cases that inverse elements are out of feature map boundary
-  if (y < -1.0 || y > height || x < -1.0 || x > width) {
-    //empty
+  if (x < -1.0 || x > width || y < -1.0 || y > height) {
+    // empty
     w1 = w2 = w3 = w4 = 0.;
     x_low = x_high = y_low = y_high = -1;
     return;
   }
 
-  if (y <= 0) y = 0;
   if (x <= 0) x = 0;
+  if (y <= 0) y = 0;
 
-  y_low = (int) y;
   x_low = (int) x;
-
-  if (y_low >= height - 1) {
-    y_high = y_low = height - 1;
-    y = (T) y_low;
-  } else {
-    y_high = y_low + 1;
-  }
+  y_low = (int) y;
 
   if (x_low >= width - 1) {
     x_high = x_low = width - 1;
@@ -136,11 +118,15 @@ __device__ void bilinear_interpolate_gradient(
   } else {
     x_high = x_low + 1;
   }
+  if (y_low >= height - 1) {
+    y_high = y_low = height - 1;
+    y = (T) y_low;
+  } else {
+    y_high = y_low + 1;
+  }
 
-  T ly = y - y_low;
-  T lx = x - x_low;
-  T hy = 1. - ly, hx = 1. - lx;
-
+  T lx = x - x_low, ly = y - y_low;
+  T hx = 1. - lx, hy = 1. - ly;
   w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
@@ -148,30 +134,23 @@
 template <typename T>
-__global__ void PointWarpBackwardFeature(const int nthreads, const T* top_diff,
-    const int num_rois, const T spatial_scale,
-    const int channels, const int height, const int width,
-
-    T* bottom_diff,
-    const T* bottom_rois) {
+__global__ void PoolPointsInterpBackward(const int nthreads, const T* top_diff,
+    const int num_rois, const T spatial_scale, const int channels,
+    const int height, const int width, T* bottom_diff, const T* coords) {
   CUDA_1D_KERNEL_LOOP(index, nthreads) {
-    int c = index % channels;
-    int n = index / channels;
-
-    const T* offset_bottom_rois = bottom_rois + n * 3;
-    // int roi_batch_ind = offset_bottom_rois[0];
-    int roi_batch_ind = n/196; // Should be original !!
+    int c = index % channels;
+    int n = index / channels;
 
-    T X_point = offset_bottom_rois[1] * spatial_scale;
-    T Y_point = offset_bottom_rois[2] * spatial_scale;
+    const T* offset_coords = coords + n * 2;
+    int roi_batch_ind = n / 196;
+    T x = offset_coords[0] * spatial_scale;
+    T y = offset_coords[1] * spatial_scale;
 
     T w1, w2, w3, w4;
     int x_low, x_high, y_low, y_high;
 
-    bilinear_interpolate_gradient(height, width, Y_point, X_point,
-        w1, w2, w3, w4,
-        x_low, x_high, y_low, y_high,
-        index);
+    bilinear_interpolate_gradient(height, width, x, y,
+        w1, w2, w3, w4, x_low, x_high, y_low, y_high, index);
 
     T* offset_bottom_diff =
         bottom_diff + (roi_batch_ind * channels + c) * height * width;
     //
@@ -193,35 +172,30 @@ __global__ void PointWarpBackwardFeature(const int nthreads, const T* top_diff,
     } // if
   } // CUDA_1D_KERNEL_LOOP
-} // ROIWarpBackward
-
+} // PoolPointsInterpBackward
 
 } // namespace
 
 template<>
 bool PoolPointsInterpOp<float, CUDAContext>::RunOnDevice() {
   auto& X = Input(0);  // Input data to pool
-  auto& R = Input(1);  // RoIs
-  auto* Y = Output(0); // RoI pooled data
+  auto& R = Input(1);  // Spatial coordinates of all points within RoIs
 
   if (R.size() == 0) {
     // Handle empty rois
-    Y->Resize(0, X.dim32(1));
-    // The following mutable_data calls are needed to allocate the tensors
-    Y->mutable_data<float>();
+    std::vector<int64_t> sizes = {0, X.dim32(1)};
+    /* auto* Y = */ Output(0, sizes, at::dtype<float>());
     return true;
   }
 
-  Y->Resize(R.dim32(0), X.dim32(1));
+  auto* Y = Output(0, {R.dim32(0), X.dim32(1)}, at::dtype<float>());  // Pooled interpolated data
 
   int output_size = Y->size();
-  PointWarpForward<float><<<CAFFE_GET_BLOCKS(output_size), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
+  PoolPointsInterpForward<float><<<CAFFE_GET_BLOCKS(output_size), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
      output_size, X.data<float>(), spatial_scale_, X.dim32(1), X.dim32(2), X.dim32(3),
-      R.data<float>(),
-      Y->mutable_data<float>()
-      );
+      R.data<float>(), Y->mutable_data<float>());
 
   return true;
 }
 
@@ -232,59 +206,46 @@ __global__ void SetKernel(const int N, const T alpha, T* Y) {
     Y[i] = alpha;
   }
 }
-}
-
+} // namespace
 
 namespace {
-
-
 template <typename T>
 __global__ void SetEvenIndsToVal(size_t num_even_inds, T val, T* data) {
   CUDA_1D_KERNEL_LOOP(i, num_even_inds) {
     data[i << 1] = val;
   }
 }
-}
-
+} // namespace
 
-
 template<>
 bool PoolPointsInterpGradientOp<float, CUDAContext>::RunOnDevice() {
   auto& X = Input(0);   // Input data to pool
-  auto& R = Input(1);   // RoIs
+  auto& R = Input(1);   // 2D Spatial coordinates of all points within RoIs
   auto& dY = Input(2);  // Gradient of net w.r.t. output of "forward" op
                         // (aka "gradOutput")
-  auto* dX = Output(0); // Gradient of net w.r.t. input to "forward" op
-                        // (aka "gradInput")
-
-  dX->ResizeLike(X);
+  auto* dX = Output(
+      0, X.sizes(), at::dtype<float>());  // Gradient of net w.r.t. input to
+                                          // "forward" op (aka "gradInput")
 
-  SetKernel<float>
-      <<<CAFFE_GET_BLOCKS(dX->size()),
-      CAFFE_CUDA_NUM_THREADS,
-      0,
-      context_.cuda_stream()>>>(
-      dX->size(),
-      0.f,
-      dX->mutable_data<float>());
+  SetKernel<float><<<CAFFE_GET_BLOCKS(dX->size()),
+      CAFFE_CUDA_NUM_THREADS,
+      0, context_.cuda_stream()>>>(
+      dX->size(), 0.f, dX->mutable_data<float>());
 
   if (dY.size() > 0) {
     // Handle possibly empty gradient if there were no rois
-    PointWarpBackwardFeature<float><<<CAFFE_GET_BLOCKS(dY.size()), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
+    PoolPointsInterpBackward<float><<<CAFFE_GET_BLOCKS(dY.size()), CAFFE_CUDA_NUM_THREADS, 0, context_.cuda_stream()>>>(
        dY.size(), dY.data<float>(), R.dim32(0),
        spatial_scale_, X.dim32(1), X.dim32(2), X.dim32(3),
-        dX->mutable_data<float>(),
-        R.data<float>());
+        dX->mutable_data<float>(), R.data<float>());
  }
  return true;
 }
 
-//namespace {
 REGISTER_CUDA_OPERATOR(PoolPointsInterp, PoolPointsInterpOp<float, CUDAContext>);
 REGISTER_CUDA_OPERATOR(PoolPointsInterpGradient, PoolPointsInterpGradientOp<float, CUDAContext>);
 
-//} // namespace
 } // namespace caffe2
diff --git a/detectron/ops/pool_points_interp.h b/detectron/ops/pool_points_interp.h
index 367a29c..02d5c65 100644
--- a/detectron/ops/pool_points_interp.h
+++ b/detectron/ops/pool_points_interp.h
@@ -1,12 +1,11 @@
 /**
-# Copyright (c) Facebook, Inc. and its affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the license found in the
+ * LICENSE file in the root directory of this source tree.
 */
-
 #ifndef POOL_POINTS_INTERP_OP_H_
 #define POOL_POINTS_INTERP_OP_H_
 
@@ -21,13 +20,14 @@ class PoolPointsInterpOp final : public Operator<Context> {
  public:
   PoolPointsInterpOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<Context>(operator_def, ws),
-        spatial_scale_(OperatorBase::GetSingleArgument<float>(
+        spatial_scale_(this->template GetSingleArgument<float>(
            "spatial_scale", 1.)) {
     DCHECK_GT(spatial_scale_, 0);
  }
   USE_OPERATOR_CONTEXT_FUNCTIONS;
 
   bool RunOnDevice() override {
+    // No CPU implementation for now
     CAFFE_NOT_IMPLEMENTED;
   }
 
@@ -40,13 +40,14 @@ class PoolPointsInterpGradientOp final : public Operator<Context> {
  public:
   PoolPointsInterpGradientOp(const OperatorDef& def, Workspace* ws)
       : Operator<Context>(def, ws),
-        spatial_scale_(OperatorBase::GetSingleArgument<float>(
-            "spatial_scale", 1.)){
+        spatial_scale_(this->template GetSingleArgument<float>(
+            "spatial_scale", 1.)) {
     DCHECK_GT(spatial_scale_, 0);
   }
   USE_OPERATOR_CONTEXT_FUNCTIONS;
 
   bool RunOnDevice() override {
+    // No CPU implementation for now
     CAFFE_NOT_IMPLEMENTED;
   }
 
@@ -56,4 +57,4 @@ class PoolPointsInterpGradientOp final : public Operator<Context> {
 
 } // namespace caffe2
 
-#endif // PoolPointsInterpOp
+#endif // POOL_POINTS_INTERP_OP_H_
diff --git a/detectron/roi_data/body_uv_rcnn.py b/detectron/roi_data/body_uv_rcnn.py
index 1cb0f0b..27c817a 100644
--- a/detectron/roi_data/body_uv_rcnn.py
+++ b/detectron/roi_data/body_uv_rcnn.py
@@ -3,18 +3,21 @@
 #
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
+##############################################################################
+
+"""Construct minibatches for DensePose training. Handles the minibatch blobs
+that are specific to DensePose. Other blobs that are generic to RPN or
+Fast/er R-CNN are handled by their respective roi_data modules.
+""" from __future__ import absolute_import from __future__ import division from __future__ import print_function from __future__ import unicode_literals -# -from scipy.io import loadmat -import copy + import cv2 import logging import numpy as np -# from detectron.core.config import cfg import detectron.utils.blob as blob_utils @@ -22,191 +25,180 @@ import detectron.utils.segms as segm_utils import detectron.utils.densepose_methods as dp_utils -# -from memory_profiler import profile -# -import os -# logger = logging.getLogger(__name__) -# + DP = dp_utils.DensePoseMethods() -# + def add_body_uv_rcnn_blobs(blobs, sampled_boxes, roidb, im_scale, batch_idx): - IsFlipped = roidb['flipped'] + """Add DensePose specific blobs to the given inputs blobs dictionary.""" M = cfg.BODY_UV_RCNN.HEATMAP_SIZE - # + # Prepare the body UV targets by associating one gt box which contains + # body UV annotations to each training roi that has a fg class label. polys_gt_inds = np.where(roidb['ignore_UV_body'] == 0)[0] - boxes_from_polys = [roidb['boxes'][i,:] for i in polys_gt_inds] - if not(boxes_from_polys): - pass - else: - boxes_from_polys = np.vstack(boxes_from_polys) - boxes_from_polys = np.array(boxes_from_polys) - + boxes_from_polys = roidb['boxes'][polys_gt_inds] + # Select foreground RoIs fg_inds = np.where(blobs['labels_int32'] > 0)[0] - roi_has_mask = np.zeros( blobs['labels_int32'].shape ) + roi_has_body_uv = np.zeros_like(blobs['labels_int32'], dtype=np.int32) - if (bool(boxes_from_polys.any()) & (fg_inds.shape[0] > 0) ): + if ((boxes_from_polys.shape[0] > 0) & (fg_inds.shape[0] > 0)): + # Find overlap between all foreground RoIs and the gt bounding boxes + # containing each body UV annotaion. rois_fg = sampled_boxes[fg_inds] - # - rois_fg.astype(np.float32, copy=False) - boxes_from_polys.astype(np.float32, copy=False) - # overlaps_bbfg_bbpolys = box_utils.bbox_overlaps( rois_fg.astype(np.float32, copy=False), - boxes_from_polys.astype(np.float32, copy=False)) + boxes_from_polys.astype(np.float32, copy=False) + ) + # Select foreground RoIs as those with > 0.7 overlap fg_polys_value = np.max(overlaps_bbfg_bbpolys, axis=1) - fg_inds = fg_inds[fg_polys_value>0.7] - - if (bool(boxes_from_polys.any()) & (fg_inds.shape[0] > 0) ): - for jj in fg_inds: - roi_has_mask[jj] = 1 - - # Create blobs for densepose supervision. - ################################################## The mask - All_labels = blob_utils.zeros((fg_inds.shape[0], M ** 2), int32=True) - All_Weights = blob_utils.zeros((fg_inds.shape[0], M ** 2), int32=True) - ################################################# The points - X_points = blob_utils.zeros((fg_inds.shape[0], 196), int32=False) - Y_points = blob_utils.zeros((fg_inds.shape[0], 196), int32=False) - Ind_points = blob_utils.zeros((fg_inds.shape[0], 196), int32=True) + fg_inds = fg_inds[fg_polys_value > 0.7] + + if ((boxes_from_polys.shape[0] > 0) & (fg_inds.shape[0] > 0)): + roi_has_body_uv[fg_inds] = 1 + # Create body UV blobs + # Dense masks, each mask for a given fg roi is of size M x M. + part_inds = blob_utils.zeros((fg_inds.shape[0], M, M), int32=True) + # Weights assigned to each target in `part_inds`. By default, all 1's. + # part_inds_weights = blob_utils.zeros((fg_inds.shape[0], M, M), int32=True) + part_inds_weights = blob_utils.ones((fg_inds.shape[0], M, M), int32=False) + # 2D spatial coordinates (on the image). Shape is (#fg_rois, 2) in format + # (x, y). 
+ coords_xy = blob_utils.zeros((fg_inds.shape[0], 196, 2), int32=False) + # 24 patch indices plus a background class I_points = blob_utils.zeros((fg_inds.shape[0], 196), int32=True) + # UV coordinates in each patch U_points = blob_utils.zeros((fg_inds.shape[0], 196), int32=False) V_points = blob_utils.zeros((fg_inds.shape[0], 196), int32=False) - Uv_point_weights = blob_utils.zeros((fg_inds.shape[0], 196), int32=False) - ################################################# + # Uv_point_weights = blob_utils.zeros((fg_inds.shape[0], 196), int32=False) rois_fg = sampled_boxes[fg_inds] - overlaps_bbfg_bbpolys = box_utils.bbox_overlaps( - rois_fg.astype(np.float32, copy=False), - boxes_from_polys.astype(np.float32, copy=False)) + overlaps_bbfg_bbpolys = overlaps_bbfg_bbpolys[fg_inds] + # Map from each fg roi to the index of the gt box with highest overlap fg_polys_inds = np.argmax(overlaps_bbfg_bbpolys, axis=1) + # Add body UV targets for each fg roi for i in range(rois_fg.shape[0]): - # - fg_polys_ind = polys_gt_inds[ fg_polys_inds[i] ] - # - Ilabel = segm_utils.GetDensePoseMask( roidb['dp_masks'][ fg_polys_ind ] ) - # - GT_I = np.array(roidb['dp_I'][ fg_polys_ind ]) - GT_U = np.array(roidb['dp_U'][ fg_polys_ind ]) - GT_V = np.array(roidb['dp_V'][ fg_polys_ind ]) - GT_x = np.array(roidb['dp_x'][ fg_polys_ind ]) - GT_y = np.array(roidb['dp_y'][ fg_polys_ind ]) - GT_weights = np.ones(GT_I.shape).astype(np.float32) - # - ## Do the flipping of the densepose annotation ! - if(IsFlipped): - GT_I,GT_U,GT_V,GT_x,GT_y,Ilabel = DP.get_symmetric_densepose(GT_I,GT_U,GT_V,GT_x,GT_y,Ilabel) - # + fg_polys_ind = fg_polys_inds[i] + polys_gt_ind = polys_gt_inds[fg_polys_ind] + # RLE encoded dense masks which are of size 256 x 256. + # Map all part masks to 14 labels (i.e., indices of semantic body parts). + dp_masks = dp_utils.GetDensePoseMask( + roidb['dp_masks'][polys_gt_ind], cfg.BODY_UV_RCNN.NUM_SEMANTIC_PARTS + ) + # Surface patch indices of collected points + dp_I = np.array(roidb['dp_I'][polys_gt_ind], dtype=np.int32) + # UV coordinates of collected points + dp_U = np.array(roidb['dp_U'][polys_gt_ind], dtype=np.float32) + dp_V = np.array(roidb['dp_V'][polys_gt_ind], dtype=np.float32) + # dp_UV_weights = np.ones_like(dp_I).astype(np.float32) + # Spatial coordinates on the image which are scaled such that the bbox + # size is 256 x 256. + dp_x = np.array(roidb['dp_x'][polys_gt_ind], dtype=np.float32) + dp_y = np.array(roidb['dp_y'][polys_gt_ind], dtype=np.float32) + # Do the flipping of the densepose annotation + if roidb['flipped']: + dp_I, dp_U, dp_V, dp_x, dp_y, dp_masks = DP.get_symmetric_densepose( + dp_I, dp_U, dp_V, dp_x, dp_y, dp_masks + ) + roi_fg = rois_fg[i] - roi_gt = boxes_from_polys[fg_polys_inds[i],:] - # - x1 = roi_fg[0] ; x2 = roi_fg[2] - y1 = roi_fg[1] ; y2 = roi_fg[3] - # - x1_source = roi_gt[0]; x2_source = roi_gt[2] - y1_source = roi_gt[1]; y2_source = roi_gt[3] - # - x_targets = ( np.arange(x1,x2, (x2 - x1)/M ) - x1_source ) * ( 256. / (x2_source-x1_source) ) - y_targets = ( np.arange(y1,y2, (y2 - y1)/M ) - y1_source ) * ( 256. / (y2_source-y1_source) ) - # - x_targets = x_targets[0:M] ## Strangely sometimes it can be M+1, so make sure size is OK! 
- y_targets = y_targets[0:M] - # - [X_targets,Y_targets] = np.meshgrid( x_targets, y_targets ) - New_Index = cv2.remap(Ilabel,X_targets.astype(np.float32), Y_targets.astype(np.float32), interpolation=cv2.INTER_NEAREST, borderMode= cv2.BORDER_CONSTANT, borderValue=(0)) - # - All_L = np.zeros(New_Index.shape) - All_W = np.ones(New_Index.shape) - # - All_L = New_Index - # - gt_length_x = x2_source - x1_source - gt_length_y = y2_source - y1_source - # - GT_y = (( GT_y / 256. * gt_length_y ) + y1_source - y1 ) * ( M / ( y2 - y1 ) ) - GT_x = (( GT_x / 256. * gt_length_x ) + x1_source - x1 ) * ( M / ( x2 - x1 ) ) - # - GT_I[GT_y<0] = 0 - GT_I[GT_y>(M-1)] = 0 - GT_I[GT_x<0] = 0 - GT_I[GT_x>(M-1)] = 0 - # - points_inside = GT_I>0 - GT_U = GT_U[points_inside] - GT_V = GT_V[points_inside] - GT_x = GT_x[points_inside] - GT_y = GT_y[points_inside] - GT_weights = GT_weights[points_inside] - GT_I = GT_I[points_inside] - # - X_points[i, 0:len(GT_x)] = GT_x - Y_points[i, 0:len(GT_y)] = GT_y - Ind_points[i, 0:len(GT_I)] = i - I_points[i, 0:len(GT_I)] = GT_I - U_points[i, 0:len(GT_U)] = GT_U - V_points[i, 0:len(GT_V)] = GT_V - Uv_point_weights[i, 0:len(GT_weights)] = GT_weights - # - All_labels[i, :] = np.reshape(All_L.astype(np.int32), M ** 2) - All_Weights[i, :] = np.reshape(All_W.astype(np.int32), M ** 2) - ## - else: + gt_box = boxes_from_polys[fg_polys_ind] + fg_x1, fg_y1, fg_x2, fg_y2 = roi_fg[0:4] + gt_x1, gt_y1, gt_x2, gt_y2 = gt_box[0:4] + fg_width = fg_x2 - fg_x1; fg_height = fg_y2 - fg_y1 + gt_width = gt_x2 - gt_x1; gt_height = gt_y2 - gt_y1 + fg_scale_w = float(M) / fg_width + fg_scale_h = float(M) / fg_height + gt_scale_w = 256. / gt_width + gt_scale_h = 256. / gt_height + # Sample M points evenly within the fg roi and scale the relative coordinates + # (to associated gt box) such that the bounding box size is 256 x 256. + x_targets = (np.arange(fg_x1, fg_x2, fg_width / M) - gt_x1) * gt_scale_w + y_targets = (np.arange(fg_y1, fg_y2, fg_height / M) - gt_y1) * gt_scale_h + # Construct 2D coordiante matrices + x_targets, y_targets = np.meshgrid(x_targets[:M], y_targets[:M]) + ## Another implementation option (which results in similar performance) + # x_targets = (np.linspace(fg_x1, fg_x2, M, endpoint=True, dtype=np.float32) - gt_x1) * gt_scale_w + # y_targets = (np.linspace(fg_y1, fg_y2, M, endpoint=True, dtype=np.float32) - gt_y1) * gt_scale_h + # x_targets = (np.linspace(fg_x1, fg_x2, M, endpoint=False) - gt_x1) * gt_scale_w + # y_targets = (np.linspace(fg_y1, fg_y2, M, endpoint=False) - gt_y1) * gt_scale_h + # x_targets, y_targets = np.meshgrid(x_targets, y_targets) + + # Map dense masks of size 256 x 256 to target heatmap of size M x M. + part_inds[i] = cv2.remap( + dp_masks, x_targets.astype(np.float32), y_targets.astype(np.float32), + interpolation=cv2.INTER_NEAREST, + borderMode=cv2.BORDER_CONSTANT, borderValue=(0) + ) + + # Scale annotated spatial coordinates from bbox of size 256 x 256 to target + # heatmap of size M x M. + dp_x = (dp_x / gt_scale_w + gt_x1 - fg_x1) * fg_scale_w + dp_y = (dp_y / gt_scale_h + gt_y1 - fg_y1) * fg_scale_h + # Set patch index of points outside the heatmap as 0 (background). + dp_I[dp_x < 0] = 0; dp_I[dp_x > (M - 1)] = 0 + dp_I[dp_y < 0] = 0; dp_I[dp_y > (M - 1)] = 0 + # Get body UV annotations of points inside the heatmap. 
+ points_inside = dp_I > 0 + dp_x = dp_x[points_inside] + dp_y = dp_y[points_inside] + dp_I = dp_I[points_inside] + dp_U = dp_U[points_inside] + dp_V = dp_V[points_inside] + # dp_UV_weights = dp_UV_weights[points_inside] + + # Update body UV blobs + num_dp_points = len(dp_I) + # coords_xy[i, 0:num_dp_points, 0] = i # fg_roi index + coords_xy[i, 0:num_dp_points, 0] = dp_x + coords_xy[i, 0:num_dp_points, 1] = dp_y + I_points[i, 0:num_dp_points] = dp_I.astype(np.int32) + U_points[i, 0:num_dp_points] = dp_U + V_points[i, 0:num_dp_points] = dp_V + # Uv_point_weights[i, 0:len(dp_UV_weights)] = dp_UV_weights + else: # If there are no fg rois + # The network cannot handle empty blobs, so we must provide a blob. + # We simply take the first bg roi, give it an all 0's body UV annotations + # and label it with class zero (bg). bg_inds = np.where(blobs['labels_int32'] == 0)[0] - # - if(len(bg_inds)==0): + # `rois_fg` is actually one background roi, but that's ok because ... + if len(bg_inds) == 0: rois_fg = sampled_boxes[0].reshape((1, -1)) else: rois_fg = sampled_boxes[bg_inds[0]].reshape((1, -1)) + # Mark that the first roi has body UV annotation + roi_has_body_uv[0] = 1 + # We give it all 0's blobs + part_inds = blob_utils.zeros((1, M, M), int32=True) + part_inds_weights = blob_utils.zeros((1, M, M), int32=False) + coords_xy = blob_utils.zeros((1, 196, 2), int32=False) + I_points = blob_utils.zeros((1, 196), int32=True) + U_points = blob_utils.zeros((1, 196), int32=False) + V_points = blob_utils.zeros((1, 196), int32=False) + # Uv_point_weights = blob_utils.zeros((1, 196), int32=False) - roi_has_mask[0] = 1 - # - X_points = blob_utils.zeros((1, 196), int32=False) - Y_points = blob_utils.zeros((1, 196), int32=False) - Ind_points = blob_utils.zeros((1, 196), int32=True) - I_points = blob_utils.zeros((1,196), int32=True) - U_points = blob_utils.zeros((1, 196), int32=False) - V_points = blob_utils.zeros((1, 196), int32=False) - Uv_point_weights = blob_utils.zeros((1, 196), int32=False) - # - All_labels = -blob_utils.ones((1, M ** 2), int32=True) * 0 ## zeros - All_Weights = -blob_utils.ones((1, M ** 2), int32=True) * 0 ## zeros - # + # Scale rois_fg and format as (batch_idx, x1, y1, x2, y2) rois_fg *= im_scale repeated_batch_idx = batch_idx * blob_utils.ones((rois_fg.shape[0], 1)) rois_fg = np.hstack((repeated_batch_idx, rois_fg)) - # - K = cfg.BODY_UV_RCNN.NUM_PATCHES - # - U_points = np.tile( U_points , [1,K+1] ) - V_points = np.tile( V_points , [1,K+1] ) - Uv_Weight_Points = np.zeros(U_points.shape) - # - for jjj in xrange(1,K+1): - Uv_Weight_Points[ : , jjj * I_points.shape[1] : (jjj+1) * I_points.shape[1] ] = ( I_points == jjj ).astype(np.float32) - # - ################ - # Update blobs dict with Mask R-CNN blobs - ############### - # - blobs['body_uv_rois'] = np.array(rois_fg) - blobs['roi_has_body_uv_int32'] = np.array(roi_has_mask).astype(np.int32) - ## - blobs['body_uv_ann_labels'] = np.array(All_labels).astype(np.int32) - blobs['body_uv_ann_weights'] = np.array(All_Weights).astype(np.float32) - # - ########################## - blobs['body_uv_X_points'] = X_points.astype(np.float32) - blobs['body_uv_Y_points'] = Y_points.astype(np.float32) - blobs['body_uv_Ind_points'] = Ind_points.astype(np.float32) - blobs['body_uv_I_points'] = I_points.astype(np.float32) - blobs['body_uv_U_points'] = U_points.astype(np.float32) #### VERY IMPORTANT : These are switched here : - blobs['body_uv_V_points'] = V_points.astype(np.float32) - blobs['body_uv_point_weights'] = Uv_Weight_Points.astype(np.float32) - 
################### - - - + # Create body UV blobs for all patches (including background) + K = cfg.BODY_UV_RCNN.NUM_PATCHES + 1 + # Construct U/V_points blobs for all patches by repeating it #num_patches times. + # Shape: (#rois, 196, K) + U_points = np.repeat(U_points[:, :, np.newaxis], K, axis=-1) + V_points = np.repeat(V_points[:, :, np.newaxis], K, axis=-1) + uv_point_weights = np.zeros_like(U_points) + # Set binary weights for UV targets in each patch + for i in np.arange(1, K): + uv_point_weights[:, :, i] = (I_points == i).astype(np.float32) + # Update blobs dict with body UV blobs + blobs['body_uv_rois'] = rois_fg + blobs['roi_has_body_uv_int32'] = roi_has_body_uv # shape: (#rois,) + blobs['body_uv_parts'] = part_inds # shape: (#rois, M, M) + blobs['body_uv_parts_weights'] = part_inds_weights + blobs['body_uv_coords_xy'] = coords_xy.reshape(-1, 2) # shape: (#rois * 196, 2) + blobs['body_uv_I_points'] = I_points.reshape(-1, 1) # shape: (#rois * 196, 1) + blobs['body_uv_U_points'] = U_points # shape: (#rois, 196, K) + blobs['body_uv_V_points'] = V_points + blobs['body_uv_point_weights'] = uv_point_weights diff --git a/detectron/roi_data/fast_rcnn.py b/detectron/roi_data/fast_rcnn.py index 2635974..153ce61 100644 --- a/detectron/roi_data/fast_rcnn.py +++ b/detectron/roi_data/fast_rcnn.py @@ -41,7 +41,6 @@ def get_fast_rcnn_blob_names(is_training=True): # labels_int32 blob: R categorical labels in [0, ..., K] for K # foreground classes plus background blob_names += ['labels_int32'] - if is_training: # bbox_targets blob: R bounding-box regression targets with 4 # targets per class blob_names += ['bbox_targets'] @@ -81,19 +80,32 @@ def get_fast_rcnn_blob_names(is_training=True): ######################## if is_training and cfg.MODEL.BODY_UV_ON: + # 'body_uv_rois': RoIs sampled for training the body UV estimation branch. + # Shape is (#fg_rois, 5) in format (batch_idx, x1, y1, x2, y2). blob_names += ['body_uv_rois'] + # 'roi_has_body_uv': binary labels for the RoIs specified in 'rois' + # indicating if each RoI has a body or not. Shape is (#rois). blob_names += ['roi_has_body_uv_int32'] - ######### - # ################################################### - blob_names += ['body_uv_ann_labels'] - blob_names += ['body_uv_ann_weights'] - # ################################################# - blob_names += ['body_uv_X_points'] - blob_names += ['body_uv_Y_points'] - blob_names += ['body_uv_Ind_points'] + # 'body_uv_parts': index of part in [0, ..., S] where S is the number of + # semantic parts used to sample body UV points for the RoIs specified in + # 'body_uv_rois'. Shape is (#rois, M, M) where M is the heat map size. + blob_names += ['body_uv_parts'] + # 'body_uv_parts_weights': weight assigned to each target in 'body_uv_parts'. + # Shape is (#rois, M, M). Used in SpatialSoftmaxWithLoss. + blob_names += ['body_uv_parts_weights'] + # 'body_uv_coords_xy': 2D spatial coordinates of collected points on + # the image. Shape is (#rois * 196, 2) in format (dp_x, dp_y). + # Used in PoolPointsInterp. + blob_names += ['body_uv_coords_xy'] + # 'body_uv_I_points': surface patch indices in [0, ..., K] for K patches + # plus background. Shape is (#rois * 196, 1). Used in SoftmaxWithLoss. blob_names += ['body_uv_I_points'] + # 'body_uv_U/V_points': UV coordinates of collected points in each patch. + # Shape is (#rois, 196, K). Used in PoolPointsInterp and SmoothL1Loss. 
blob_names += ['body_uv_U_points'] blob_names += ['body_uv_V_points'] + # 'body_uv_point_weights': weight assigned to each target in + # 'body_uv_U/V_points'. Shape is (#rois, 196, K). Used in SmoothL1Loss. blob_names += ['body_uv_point_weights'] if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS: @@ -173,7 +185,7 @@ def _sample_rois(roidb, im_scale, batch_idx): # against there being fewer than desired) bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_inds.size) - # Sample foreground regions without replacement + # Sample background regions without replacement if bg_inds.size > 0: bg_inds = npr.choice( bg_inds, size=bg_rois_per_this_image, replace=False diff --git a/detectron/utils/densepose_methods.py b/detectron/utils/densepose_methods.py index 4d1dcf9..3c19aa9 100644 --- a/detectron/utils/densepose_methods.py +++ b/detectron/utils/densepose_methods.py @@ -3,141 +3,137 @@ # # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. +############################################################################## +"""DensePose utilities.""" + +from scipy.io import loadmat +import os.path as osp import numpy as np -import copy -import cv2 -from scipy.io import loadmat -import scipy.spatial.distance -import os +import scipy.spatial.distance as ssd +import pycocotools.mask as mask_util + + +def GetDensePoseMask(Polys, num_parts=14): + """Get dense masks from the encoded masks.""" + MaskGen = np.zeros((256, 256), dtype=np.int32) + for i in range(1, num_parts + 1): + if Polys[i - 1]: + current_mask = mask_util.decode(Polys[i - 1]) + MaskGen[current_mask > 0] = i + return MaskGen class DensePoseMethods: def __init__(self): - # - ALP_UV = loadmat( os.path.join(os.path.dirname(__file__), '../../DensePoseData/UV_data/UV_Processed.mat') ) - self.FaceIndices = np.array( ALP_UV['All_FaceIndices']).squeeze() - self.FacesDensePose = ALP_UV['All_Faces']-1 + ALP_UV = loadmat( + osp.join(osp.dirname(__file__), '../../DensePoseData/UV_data/UV_Processed.mat') + ) + self.FaceIndices = np.array(ALP_UV['All_FaceIndices']).squeeze() + self.FacesDensePose = ALP_UV['All_Faces'] - 1 self.U_norm = ALP_UV['All_U_norm'].squeeze() self.V_norm = ALP_UV['All_V_norm'].squeeze() - self.All_vertices = ALP_UV['All_vertices'][0] - ## Info to compute symmetries. - self.SemanticMaskSymmetries = [0,1,3,2,5,4,7,6,9,8,11,10,13,12,14] - self.Index_Symmetry_List = [1,2,4,3,6,5,8,7,10,9,12,11,14,13,16,15,18,17,20,19,22,21,24,23]; - UV_symmetry_filename = os.path.join(os.path.dirname(__file__), '../../DensePoseData/UV_data/UV_symmetry_transforms.mat') - self.UV_symmetry_transformations = loadmat( UV_symmetry_filename ) - - - def get_symmetric_densepose(self,I,U,V,x,y,Mask): - ### This is a function to get the mirror symmetric UV labels. 
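The flip logic rewritten below swaps paired left/right surface patches via Index_Symmetry_List and mirrors x against the 256-wide part mask. A minimal NumPy illustration with made-up point values (the left/right reading of specific indices is only for intuition):

    import numpy as np

    # Verbatim from DensePoseMethods.__init__ above
    index_symmetry = [1, 2, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11,
                      14, 13, 16, 15, 18, 17, 20, 19, 22, 21, 24, 23]
    I = np.array([3, 4, 7])  # surface patch indices of three points
    symm_I = np.array([index_symmetry[i - 1] for i in I])  # -> [4, 3, 8]
    # Spatial x coordinates are mirrored against the mask width:
    x = np.array([10., 50., 200.])
    symm_x = 256. - x  # x_max is 256 for the 256 x 256 dense masks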
- Labels_sym= np.zeros(I.shape) - U_sym= np.zeros(U.shape) - V_sym= np.zeros(V.shape) - ### - for i in ( range(24)): - if i+1 in I: - Labels_sym[I == (i+1)] = self.Index_Symmetry_List[i] - jj = np.where(I == (i+1)) - ### - U_loc = (U[jj]*255).astype(np.int64) - V_loc = (V[jj]*255).astype(np.int64) - ### - V_sym[jj] = self.UV_symmetry_transformations['V_transforms'][0,i][V_loc,U_loc] - U_sym[jj] = self.UV_symmetry_transformations['U_transforms'][0,i][V_loc,U_loc] - ## - Mask_flip = np.fliplr(Mask) - Mask_flipped = np.zeros(Mask.shape) - # - for i in ( range(14)): - Mask_flipped[Mask_flip == (i+1)] = self.SemanticMaskSymmetries[i+1] - # - [y_max , x_max ] = Mask_flip.shape - y_sym = y - x_sym = x_max-x - # - return Labels_sym , U_sym , V_sym , x_sym , y_sym , Mask_flipped - - - - def barycentric_coordinates_exists(self,P0, P1, P2, P): - u = P1 - P0 - v = P2 - P0 - w = P - P0 - # - vCrossW = np.cross(v,w) + self.All_vertices = ALP_UV['All_vertices'][0] + self.SemanticMaskSymmetries = [ + 0, 1, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14 + ] + self.Index_Symmetry_List = [ + 1, 2, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15, 18, 17, 20, 19, 22, 21, 24, 23 + ] + self.UV_symmetry_transformations = loadmat( + osp.join(osp.dirname(__file__), '../../DensePoseData/UV_data/UV_symmetry_transforms.mat') + ) + + def get_symmetric_densepose(self, I, U, V, x, y, mask): + """Get the mirror symmetric UV annotations""" + symm_I = np.zeros_like(I) + symm_U = np.zeros_like(U) + symm_V = np.zeros_like(V) + for i in range(24): + inds = np.where(I == (i + 1))[0] + if len(inds) > 0: + symm_I[inds] = self.Index_Symmetry_List[i] + loc_U = (U[inds] * 255).astype(np.int32) + loc_V = (V[inds] * 255).astype(np.int32) + symm_U[inds] = self.UV_symmetry_transformations['U_transforms'][0, i][loc_V, loc_U] + symm_V[inds] = self.UV_symmetry_transformations['V_transforms'][0, i][loc_V, loc_U] + + flip_mask = np.fliplr(mask) + symm_mask = np.zeros_like(mask) + for i in range(1, 15): + symm_mask[flip_mask == i] = self.SemanticMaskSymmetries[i] + x_max = flip_mask.shape[1] + symm_x = x_max - x + symm_y = y + return symm_I, symm_U, symm_V, symm_x, symm_y, symm_mask + + def barycentric_coordinates_exists(self, P0, P1, P2, P): + u = P1 - P0; v = P2 - P0; w = P - P0 + vCrossW = np.cross(v, w) vCrossU = np.cross(v, u) - if (np.dot(vCrossW, vCrossU) < 0): - return False; - # + if np.dot(vCrossW, vCrossU) < 0: + return False + uCrossW = np.cross(u, w) uCrossV = np.cross(u, v) - # if (np.dot(uCrossW, uCrossV) < 0): - return False; - # - denom = np.sqrt((uCrossV**2).sum()) - r = np.sqrt((vCrossW**2).sum())/denom - t = np.sqrt((uCrossW**2).sum())/denom - # - return((r <=1) & (t <= 1) & (r + t <= 1)) - - def barycentric_coordinates(self,P0, P1, P2, P): - u = P1 - P0 - v = P2 - P0 - w = P - P0 - # - vCrossW = np.cross(v,w) + return False + + denom = np.sqrt((uCrossV ** 2).sum()) + r = np.sqrt((vCrossW ** 2).sum()) / denom + t = np.sqrt((uCrossW ** 2).sum()) / denom + return ((r <= 1) & (t <= 1) & (r + t <= 1)) + + def barycentric_coordinates(self, P0, P1, P2, P): + u = P1 - P0; v = P2 - P0; w = P - P0 + vCrossW = np.cross(v, w) vCrossU = np.cross(v, u) - # + if np.dot(vCrossW, vCrossU) < 0: + return -1, -1, -1 uCrossW = np.cross(u, w) uCrossV = np.cross(u, v) - # - denom = np.sqrt((uCrossV**2).sum()) - r = np.sqrt((vCrossW**2).sum())/denom - t = np.sqrt((uCrossW**2).sum())/denom - # - return(1-(r+t),r,t) - - def IUV2FBC( self, I_point , U_point, V_point): - P = [ U_point , V_point , 0 ] - FaceIndicesNow = np.where( self.FaceIndices == 
I_point ) - FacesNow = self.FacesDensePose[FaceIndicesNow] - # - P_0 = np.vstack( (self.U_norm[FacesNow][:,0], self.V_norm[FacesNow][:,0], np.zeros(self.U_norm[FacesNow][:,0].shape))).transpose() - P_1 = np.vstack( (self.U_norm[FacesNow][:,1], self.V_norm[FacesNow][:,1], np.zeros(self.U_norm[FacesNow][:,1].shape))).transpose() - P_2 = np.vstack( (self.U_norm[FacesNow][:,2], self.V_norm[FacesNow][:,2], np.zeros(self.U_norm[FacesNow][:,2].shape))).transpose() - # - - for i, [P0,P1,P2] in enumerate( zip(P_0,P_1,P_2)) : - if(self.barycentric_coordinates_exists(P0, P1, P2, P)): - [bc1,bc2,bc3] = self.barycentric_coordinates(P0, P1, P2, P) - return(FaceIndicesNow[0][i],bc1,bc2,bc3) - # + if np.dot(uCrossW, uCrossV) < 0: + return -1, -1, -1 + denom = np.sqrt((uCrossV ** 2).sum()) + r = np.sqrt((vCrossW ** 2).sum()) / denom + t = np.sqrt((uCrossW ** 2).sum()) / denom + if ((r <= 1) & (t <= 1) & (r + t <= 1)): + return 1 - (r + t), r, t + else: + return -1, -1, -1 + + def IUV2FBC(self, I_point, U_point, V_point): + """Convert IUV to FBC (faceIndex and barycentric coordinates).""" + P = [U_point, V_point, 0] + faceIndicesNow = np.where(self.FaceIndices == I_point)[0] + FacesNow = self.FacesDensePose[faceIndicesNow] + v0 = np.zeros_like(self.U_norm[FacesNow][:, 0]) + P_0 = np.vstack((self.U_norm[FacesNow][:, 0], self.V_norm[FacesNow][:, 0], v0)).transpose() + P_1 = np.vstack((self.U_norm[FacesNow][:, 1], self.V_norm[FacesNow][:, 1], v0)).transpose() + P_2 = np.vstack((self.U_norm[FacesNow][:, 2], self.V_norm[FacesNow][:, 2], v0)).transpose() + + for i, [P0, P1, P2] in enumerate(zip(P_0, P_1, P_2)) : + bc1, bc2, bc3 = self.barycentric_coordinates(P0, P1, P2, P) + if bc1 != -1: + return faceIndicesNow[i], bc1, bc2, bc3 + # If the found UV is not inside any faces, select the vertex that is closest! - # - D1 = scipy.spatial.distance.cdist( np.array( [U_point,V_point])[np.newaxis,:] , P_0[:,0:2]).squeeze() - D2 = scipy.spatial.distance.cdist( np.array( [U_point,V_point])[np.newaxis,:] , P_1[:,0:2]).squeeze() - D3 = scipy.spatial.distance.cdist( np.array( [U_point,V_point])[np.newaxis,:] , P_2[:,0:2]).squeeze() - # - minD1 = D1.min() - minD2 = D2.min() - minD3 = D3.min() - # - if((minD1< minD2) & (minD1< minD3)): - return( FaceIndicesNow[0][np.argmin(D1)] , 1.,0.,0. ) - elif((minD2< minD1) & (minD2< minD3)): - return( FaceIndicesNow[0][np.argmin(D2)] , 0.,1.,0. ) + D1 = ssd.cdist(np.array([U_point, V_point])[np.newaxis, :], P_0[:, 0:2]).squeeze() + D2 = ssd.cdist(np.array([U_point, V_point])[np.newaxis, :], P_1[:, 0:2]).squeeze() + D3 = ssd.cdist(np.array([U_point, V_point])[np.newaxis, :], P_2[:, 0:2]).squeeze() + minD1 = D1.min(); minD2 = D2.min(); minD3 = D3.min() + if ((minD1 < minD2) & (minD1 < minD3)): + return faceIndicesNow[np.argmin(D1)], 1., 0., 0. + elif ((minD2 < minD1) & (minD2 < minD3)): + return faceIndicesNow[np.argmin(D2)], 0., 1., 0. else: - return( FaceIndicesNow[0][np.argmin(D3)] , 0.,0.,1. ) - - - def FBC2PointOnSurface( self, FaceIndex, bc1,bc2,bc3,Vertices ): - ## - Vert_indices = self.All_vertices[self.FacesDensePose[FaceIndex]]-1 - ## - p = Vertices[Vert_indices[0],:] * bc1 + \ - Vertices[Vert_indices[1],:] * bc2 + \ - Vertices[Vert_indices[2],:] * bc3 - ## - return(p) - + return faceIndicesNow[np.argmin(D3)], 0., 0., 1. 
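A quick worked example of the barycentric test used by IUV2FBC: for the triangle P0=(0,0), P1=(1,0), P2=(0,1) and the query point P=(0.25, 0.25), the point lies inside and P = 0.5*P0 + 0.25*P1 + 0.25*P2. Sketch (instantiating the class assumes the DensePoseData UV .mat files are on disk):

    import numpy as np
    from detectron.utils.densepose_methods import DensePoseMethods

    dp = DensePoseMethods()  # loads ../../DensePoseData/UV_data/*.mat
    P0 = np.array([0., 0., 0.])
    P1 = np.array([1., 0., 0.])
    P2 = np.array([0., 1., 0.])
    P = np.array([0.25, 0.25, 0.])
    print(dp.barycentric_coordinates(P0, P1, P2, P))  # (0.5, 0.25, 0.25)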
+ + def FBC2PointOnSurface(self, face_ind, bc1, bc2, bc3, vertices): + """Use FBC to get 3D coordinates on the surface.""" + Vert_indices = self.All_vertices[self.FacesDensePose[face_ind]] - 1 + # p = vertices[Vert_indices[0], :] * bc1 + \ + # vertices[Vert_indices[1], :] * bc2 + \ + # vertices[Vert_indices[2], :] * bc3 + p = np.matmul(np.array([[bc1, bc2, bc3]]), vertices[Vert_indices]).squeeze() + return p diff --git a/detectron/utils/segms.py b/detectron/utils/segms.py index bf1ac3f..9967a24 100644 --- a/detectron/utils/segms.py +++ b/detectron/utils/segms.py @@ -20,19 +20,9 @@ from __future__ import unicode_literals import numpy as np - import pycocotools.mask as mask_util -def GetDensePoseMask(Polys): - MaskGen = np.zeros([256,256]) - for i in range(1,15): - if(Polys[i-1]): - current_mask = mask_util.decode(Polys[i-1]) - MaskGen[current_mask>0] = i - return MaskGen - - def flip_segms(segms, height, width): """Left/right flip each mask in a list of masks.""" def _flip_poly(poly, width):
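With GetDensePoseMask moved out of segms.py and into densepose_methods (see above), decoding the per-part RLE masks of a DensePose annotation now takes the number of semantic parts explicitly. A short usage sketch, where `ann` stands in for a hypothetical COCO DensePose annotation dict:

    import detectron.utils.densepose_methods as dp_utils

    # 'dp_masks' holds up to 14 RLE-encoded part masks; the result is a
    # 256 x 256 int32 label map with values in [0, 14] (0 = background).
    label_map = dp_utils.GetDensePoseMask(ann['dp_masks'], num_parts=14)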