From 51c2f6799d7c5141d5b8a79e6bea268b9424e416 Mon Sep 17 00:00:00 2001
From: rkansal47
Date: Thu, 25 Jul 2024 02:56:22 -0700
Subject: [PATCH] updates for hhbbvv test

---
 README.md                                   |  6 ++
 src/HHbbVV/postprocessing/TopAnalysis.ipynb | 66 +++++++++++++--------
 src/HHbbVV/postprocessing/TrainBDT.py       |  1 +
 src/HHbbVV/postprocessing/corrections.py    | 14 ++++-
 src/HHbbVV/postprocessing/postprocessing.py |  3 +
 5 files changed, 64 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index f6fc36a2..795dbea0 100644
--- a/README.md
+++ b/README.md
@@ -321,6 +321,12 @@ In `src/HHbbVV/postprocessing`:
 python BDTPreProcessing.py --data-dir "../../../../data/skimmer/Feb24/" --signal-data-dir "../../../../data/skimmer/Jun10/" --plot-dir "../../../plots/BDTPreProcessing/$TAG/" --year "2017" --bdt-data (--control-plots)
 ```
 
+Running inference with a trained model, e.g.:
+
+```bash
+python src/HHbbVV/postprocessing/BDTPreProcessing.py --no-save-data --inference --bdt-preds-dir temp/24_04_05_k2v0_training_eqsig_vbf_vars_rm_deta/ --data-dir temp --year 2016 --sig-samples HHbbVV --bg-keys "" --no-data --no-do-jshifts
+```
+
 ### BDT Trainings
 
 ```bash
diff --git a/src/HHbbVV/postprocessing/TopAnalysis.ipynb b/src/HHbbVV/postprocessing/TopAnalysis.ipynb
index 2404d54b..393a3e68 100644
--- a/src/HHbbVV/postprocessing/TopAnalysis.ipynb
+++ b/src/HHbbVV/postprocessing/TopAnalysis.ipynb
@@ -376,21 +376,21 @@
     "# package_path = Path(__file__).parent.parent.resolve()\n",
     "package_path = Path(\"../\").resolve()\n",
     "\n",
-    "for key in [\n",
-    "    \"TTToSemiLeptonic\",\n",
-    "    # \"ST_tW_antitop_5f_NoFullyHadronicDecays\",\n",
-    "    # \"ST_tW_top_5f_NoFullyHadronicDecays\",\n",
-    "    # \"ST_s-channel_4f_leptonDecays\",\n",
-    "    # \"ST_t-channel_antitop_4f_InclusiveDecays\",\n",
-    "    # \"ST_t-channel_top_4f_InclusiveDecays\",\n",
-    "]:\n",
-    "    sig_lp_hist = utils.get_pickles(f\"{signal_data_dir}/{year}/{key}/pickles\", year, key)[\"lp_hist\"]\n",
-    "\n",
-    "    # remove negatives\n",
-    "    sig_lp_hist.values()[sig_lp_hist.values() < 0] = 0\n",
-    "\n",
-    "    with (package_path / f\"corrections/lp_ratios/signals/{year}_{key}.hist\").open(\"wb\") as f:\n",
-    "        pickle.dump(sig_lp_hist, f)"
+    "# for key in [\n",
+    "#     \"TTToSemiLeptonic\",\n",
+    "#     # \"ST_tW_antitop_5f_NoFullyHadronicDecays\",\n",
+    "#     # \"ST_tW_top_5f_NoFullyHadronicDecays\",\n",
+    "#     # \"ST_s-channel_4f_leptonDecays\",\n",
+    "#     # \"ST_t-channel_antitop_4f_InclusiveDecays\",\n",
+    "#     # \"ST_t-channel_top_4f_InclusiveDecays\",\n",
+    "# ]:\n",
+    "#     sig_lp_hist = utils.get_pickles(f\"{signal_data_dir}/{year}/{key}/pickles\", year, key)[\"lp_hist\"]\n",
+    "\n",
+    "#     # remove negatives\n",
+    "#     sig_lp_hist.values()[sig_lp_hist.values() < 0] = 0\n",
+    "\n",
+    "#     with (package_path / f\"corrections/lp_ratios/signals/{year}_{key}.hist\").open(\"wb\") as f:\n",
+    "#         pickle.dump(sig_lp_hist, f)"
    ]
   },
   {
@@ -422,12 +422,15 @@
     "import uproot\n",
     "\n",
     "# initialize lund plane scale factors lookups\n",
-    "f = uproot.open(package_path / f\"corrections/lp_ratios/ratio_{year[:4]}.root\")\n",
+    "# f = uproot.open(package_path / f\"corrections/lp_ratios/ratio_{year[:4]}.root\")\n",
+    "f = uproot.open(package_path / f\"corrections/lp_ratios/ratio_2018.root\")\n",
     "\n",
     "# 3D histogram: [subjet_pt, ln(0.8/Delta), ln(kT/GeV)]\n",
     "mc_nom = f[\"mc_nom\"].to_numpy()\n",
     "ratio_edges = mc_nom[1:]\n",
-    "mc_nom = mc_nom[0]"
+    "mc_nom = mc_nom[0]\n",
+    "\n",
+    "ratio_nom = f[\"ratio_nom\"].to_numpy()[0]"
    ]
   },
   {
@@ -436,7 +439,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "plt.imshow(sig_lp_hist.values()[0])"
"plt.imshow(sig_lp_hist.values()[0])" + "np.min(ratio_nom[ratio_nom > 0])" ] }, { @@ -445,7 +448,10 @@ "metadata": {}, "outputs": [], "source": [ - "plt.imshow(mc_nom[0])" + "with (package_path / f\"corrections/lp_ratios/signals/2018_GluGluToHHTobbVV_node_cHHH1.hist\").open(\n", + " \"rb\"\n", + ") as f:\n", + " sig_lp_hist = pickle.load(f)" ] }, { @@ -454,8 +460,16 @@ "metadata": {}, "outputs": [], "source": [ - "plt.imshow(sig_mc_ratio_pt[5])\n", - "plt.colorbar()" + "plt.imshow(sig_lp_hist.values()[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.imshow(mc_nom[0])" ] }, { @@ -465,7 +479,10 @@ "outputs": [], "source": [ "mc_tot = np.sum(mc_nom)\n", - "sig_tot = sig_lp_hist.sum()" + "sig_tot = sig_lp_hist.sum()\n", + "sig_mc_ratio = np.clip(\n", + " np.nan_to_num((sig_lp_hist.values() / sig_tot) / (mc_nom / mc_tot), nan=1), 0.5, 2.0\n", + ")" ] }, { @@ -487,9 +504,8 @@ "metadata": {}, "outputs": [], "source": [ - "sig_mc_ratio = np.clip(\n", - " np.nan_to_num((sig_lp_hist.values() / sig_tot) / (mc_nom / mc_tot), nan=1), 0.5, 2.0\n", - ")" + "plt.imshow(sig_mc_ratio_pt[0])\n", + "plt.colorbar()" ] }, { diff --git a/src/HHbbVV/postprocessing/TrainBDT.py b/src/HHbbVV/postprocessing/TrainBDT.py index 5c649b3e..80a0a3c5 100644 --- a/src/HHbbVV/postprocessing/TrainBDT.py +++ b/src/HHbbVV/postprocessing/TrainBDT.py @@ -821,6 +821,7 @@ def do_inference_year( # preds = preds[:, :-1] if multiclass else preds[:, 1] # save n-1 probs to save space preds = preds if multiclass else preds[:, 1] np.save(f"{model_dir}/inferences/{year}/preds.npy", preds) + print(preds) if jec_jmsr_shifts: for jshift in jec_shifts: diff --git a/src/HHbbVV/postprocessing/corrections.py b/src/HHbbVV/postprocessing/corrections.py index 64b95dd5..a6e68b74 100644 --- a/src/HHbbVV/postprocessing/corrections.py +++ b/src/HHbbVV/postprocessing/corrections.py @@ -310,10 +310,20 @@ def postprocess_lpsfs( else: raise ValueError("LP SF shapes are invalid") + nom_mean = None for key in ["lp_sf_nom", "lp_sf_toys", "lp_sf_pt_extrap_vars"] + sf_vars: CLIP = 5.0 td[key] = np.clip(np.nan_to_num(td[key], nan=1.0), 1.0 / CLIP, CLIP) - td[key] = td[key] / np.mean(td[key], axis=0) + + if key == "lp_sf_nom": + nom_mean = np.mean(td[key], axis=0) + + if "unmatched" not in key: + td[key] = td[key] / np.mean(td[key], axis=0) + else: + # unmatched normalization is otherwise dominated by unmatched jets which aren't in the pass regions + # which artificially inflates this uncertainty + td[key] = td[key] / nom_mean # add to dataframe if save_all: @@ -389,4 +399,6 @@ def get_lpsf( # tot_rel_unc = np.mean([tot_rel_unc_up, tot_rel_unc_down]) tot_unc = (lp_sf * tot_rel_unc_up, lp_sf * tot_rel_unc_down) + # breakpoint() + return lp_sf, tot_unc, uncs, uncs_asym diff --git a/src/HHbbVV/postprocessing/postprocessing.py b/src/HHbbVV/postprocessing/postprocessing.py index 1ba3ed05..4dad9bd9 100644 --- a/src/HHbbVV/postprocessing/postprocessing.py +++ b/src/HHbbVV/postprocessing/postprocessing.py @@ -520,8 +520,11 @@ def _add_nonres_columns(df, bb_mask, vbf_vars=False, ptlabel="", mlabel=""): if f"vbf_Mass_jj{ptlabel}" not in df.columns: df[f"vbf_Mass_jj{ptlabel}"] = jj.M + # df[f"vbf_Mass_jj{ptlabel}"] = np.nan_to_num(jj.M) if "vbf_dEta_jj" not in df.columns: df["vbf_dEta_jj"] = np.abs(vbf1.eta - vbf2.eta) + # df["vbf_dEta_jj"] = np.nan_to_num(np.abs(vbf1.eta - vbf2.eta)) + # print(f"VBF jet vars: {time.time() - start:.2f}") if not vbf_vars: