From 3f171dbfad4674949d07fb48753e0284cd6b73a1 Mon Sep 17 00:00:00 2001
From: rkansal47 <rkansal@cern.ch>
Date: Sat, 27 Jul 2024 01:10:40 -0700
Subject: [PATCH] empty subjet bug fix

---
 src/HHbbVV/processors/corrections.py          | 13 ++-
 .../scale_factors/top_reweighting.ipynb       | 99 ++++---------------
 2 files changed, 31 insertions(+), 81 deletions(-)

diff --git a/src/HHbbVV/processors/corrections.py b/src/HHbbVV/processors/corrections.py
index 39125144..480b0911 100644
--- a/src/HHbbVV/processors/corrections.py
+++ b/src/HHbbVV/processors/corrections.py
@@ -903,12 +903,21 @@ def _get_lund_arrays(
     kt_subjets_pt = kt_subjets_vec.pt * jec_correction
     # get constituents
     kt_subjet_consts = kt_clustering.exclusive_jets_constituents(num_prongs)
-    # breakpoint()
     kt_subjet_consts = kt_subjet_consts[kt_subjet_consts.pt > min_pt]
+    kt_subjet_consts = ak.flatten(kt_subjet_consts, axis=1)
+
+    # dummy particle (all P4 fields set to 0) to pad empty subjets. SF for these subjets will be 1
+    dummy_particle = ak.Array(
+        [{kin_key: 0.0 for kin_key in P4}],
+        with_name="PtEtaPhiMLorentzVector",
+    )
+
+    # give subjets left with no constituents after the min_pt cut a single dummy particle
+    kt_subjet_consts = ak.fill_none(ak.pad_none(kt_subjet_consts, 1, axis=1), dummy_particle[0])
 
     # then re-cluster with CA
     # won't need to flatten once https://github.com/scikit-hep/fastjet/pull/145 is released
-    ca_clustering = fastjet.ClusterSequence(ak.flatten(kt_subjet_consts, axis=1), cadef)
+    ca_clustering = fastjet.ClusterSequence(kt_subjet_consts, cadef)
     lds = ca_clustering.exclusive_jets_lund_declusterings(1)
 
     return lds, kt_subjets_vec, kt_subjets_pt
diff --git a/src/HHbbVV/scale_factors/top_reweighting.ipynb b/src/HHbbVV/scale_factors/top_reweighting.ipynb
index 8de28262..2be8baa0 100644
--- a/src/HHbbVV/scale_factors/top_reweighting.ipynb
+++ b/src/HHbbVV/scale_factors/top_reweighting.ipynb
@@ -799,7 +799,20 @@
     "cadef = fastjet.JetDefinition(fastjet.cambridge_algorithm, dR)\n",
     "ktdef = fastjet.JetDefinition(fastjet.kt_algorithm, dR)\n",
     "\n",
-    "num_prongs = 3"
+    "num_prongs = 3\n",
+    "min_pt = 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dummy_particle = ak.Array(\n",
+    "    [{kin_key: 0.0 for kin_key in skim_vars}],\n",
+    "    with_name=\"PtEtaPhiMLorentzVector\",\n",
+    ")"
    ]
   },
   {
@@ -815,10 +828,15 @@
     "    np.linalg.norm((kt_subjets.px, kt_subjets.py), axis=0) * jec_correction[:, np.newaxis]\n",
     ")\n",
     "kt_subjet_consts = kt_clustering.exclusive_jets_constituents(3)\n",
+    "kt_subjet_consts = kt_subjet_consts[kt_subjet_consts.pt > min_pt]\n",
+    "\n",
+    "kt_subjet_consts = ak.flatten(kt_subjet_consts, axis=1)\n",
+    "filled_consts = ak.fill_none(ak.pad_none(kt_subjet_consts, 1, axis=1), dummy_particle[0])\n",
     "\n",
     "# then re-cluster with CA\n",
     "# won't need to flatten once https://github.com/scikit-hep/fastjet/pull/145 is released\n",
-    "ca_clustering = fastjet.ClusterSequence(ak.flatten(kt_subjet_consts, axis=1), cadef)\n",
+    "# ca_clustering = fastjet.ClusterSequence(ak.flatten(kt_subjet_consts, axis=1), cadef)\n",
+    "ca_clustering = fastjet.ClusterSequence(filled_consts, cadef)\n",
     "lds = ca_clustering.exclusive_jets_lund_declusterings(1)\n",
     "lds_flat = ak.flatten(lds, axis=1)"
    ]
@@ -883,24 +901,6 @@
     "    return ld_offsets, flat_logD, flat_logkt, flat_subjet_pt"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "kt_subjets_pt"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "kt_subjets_pt[np.arange(len(kt_subjets_pt)), closest_sjidx]"
-   ]
-  },
   {
    "attachments": {},
    "cell_type": "markdown",
@@ -1185,33 +1185,6 @@
     "flat_weight = np.repeat(np.repeat(weight, num_prongs), ak.count(lds_flat.kt, axis=1))"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "type(lds.layout) is ak._ext.ListArray64"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "lp_hist"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sum([lp_hist, []])"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -1235,29 +1208,6 @@
     "lp_hist[0, ...]"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Testing summing over histograms\n",
-    "from copy import deepcopy\n",
-    "\n",
-    "lp_hist2 = deepcopy(lp_hist)\n",
-    "lp_hist2.values()[:] = 1\n",
-    "lp_hist2"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sum([lp_hist, lp_hist2])"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1359,15 +1309,6 @@
     "sf_vals = np.array(sf_vals)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "sf_vals"
-   ]
-  },
   {
    "attachments": {},
    "cell_type": "markdown",