From 51c2f6799d7c5141d5b8a79e6bea268b9424e416 Mon Sep 17 00:00:00 2001
From: rkansal47
Date: Thu, 25 Jul 2024 02:56:22 -0700
Subject: [PATCH] updates for hhbbvv test

---
 README.md                                   |  6 ++
 src/HHbbVV/postprocessing/TopAnalysis.ipynb | 66 +++++++++++++--------
 src/HHbbVV/postprocessing/TrainBDT.py       |  1 +
 src/HHbbVV/postprocessing/corrections.py    | 14 ++++-
 src/HHbbVV/postprocessing/postprocessing.py |  3 +
 5 files changed, 64 insertions(+), 26 deletions(-)

diff --git a/README.md b/README.md
index f6fc36a2..795dbea0 100644
--- a/README.md
+++ b/README.md
@@ -321,6 +321,12 @@ In `src/HHbbVV/postprocessing`:
 python BDTPreProcessing.py --data-dir "../../../../data/skimmer/Feb24/" --signal-data-dir "../../../../data/skimmer/Jun10/" --plot-dir "../../../plots/BDTPreProcessing/$TAG/" --year "2017" --bdt-data (--control-plots)
 ```
 
+Running inference with a trained model, e.g.:
+
+```bash
+python src/HHbbVV/postprocessing/BDTPreProcessing.py --no-save-data --inference --bdt-preds-dir temp/24_04_05_k2v0_training_eqsig_vbf_vars_rm_deta/ --data-dir temp --year 2016 --sig-samples HHbbVV --bg-keys "" --no-data --no-do-jshifts
+```
+
 ### BDT Trainings
 
 ```bash
diff --git a/src/HHbbVV/postprocessing/TopAnalysis.ipynb b/src/HHbbVV/postprocessing/TopAnalysis.ipynb
index 2404d54b..393a3e68 100644
--- a/src/HHbbVV/postprocessing/TopAnalysis.ipynb
+++ b/src/HHbbVV/postprocessing/TopAnalysis.ipynb
@@ -376,21 +376,21 @@
     "# package_path = Path(__file__).parent.parent.resolve()\n",
     "package_path = Path(\"../\").resolve()\n",
     "\n",
-    "for key in [\n",
-    "    \"TTToSemiLeptonic\",\n",
-    "    # \"ST_tW_antitop_5f_NoFullyHadronicDecays\",\n",
-    "    # \"ST_tW_top_5f_NoFullyHadronicDecays\",\n",
-    "    # \"ST_s-channel_4f_leptonDecays\",\n",
-    "    # \"ST_t-channel_antitop_4f_InclusiveDecays\",\n",
-    "    # \"ST_t-channel_top_4f_InclusiveDecays\",\n",
-    "]:\n",
-    "    sig_lp_hist = utils.get_pickles(f\"{signal_data_dir}/{year}/{key}/pickles\", year, key)[\"lp_hist\"]\n",
-    "\n",
-    "    # remove negatives\n",
-    "    sig_lp_hist.values()[sig_lp_hist.values() < 0] = 0\n",
-    "\n",
-    "    with (package_path / f\"corrections/lp_ratios/signals/{year}_{key}.hist\").open(\"wb\") as f:\n",
-    "        pickle.dump(sig_lp_hist, f)"
+    "# for key in [\n",
+    "#     \"TTToSemiLeptonic\",\n",
+    "#     # \"ST_tW_antitop_5f_NoFullyHadronicDecays\",\n",
+    "#     # \"ST_tW_top_5f_NoFullyHadronicDecays\",\n",
+    "#     # \"ST_s-channel_4f_leptonDecays\",\n",
+    "#     # \"ST_t-channel_antitop_4f_InclusiveDecays\",\n",
+    "#     # \"ST_t-channel_top_4f_InclusiveDecays\",\n",
+    "# ]:\n",
+    "#     sig_lp_hist = utils.get_pickles(f\"{signal_data_dir}/{year}/{key}/pickles\", year, key)[\"lp_hist\"]\n",
+    "\n",
+    "#     # remove negatives\n",
+    "#     sig_lp_hist.values()[sig_lp_hist.values() < 0] = 0\n",
+    "\n",
+    "#     with (package_path / f\"corrections/lp_ratios/signals/{year}_{key}.hist\").open(\"wb\") as f:\n",
+    "#         pickle.dump(sig_lp_hist, f)"
    ]
   },
   {
@@ -422,12 +422,15 @@
     "import uproot\n",
     "\n",
     "# initialize lund plane scale factors lookups\n",
-    "f = uproot.open(package_path / f\"corrections/lp_ratios/ratio_{year[:4]}.root\")\n",
+    "# f = uproot.open(package_path / f\"corrections/lp_ratios/ratio_{year[:4]}.root\")\n",
+    "f = uproot.open(package_path / f\"corrections/lp_ratios/ratio_2018.root\")\n",
     "\n",
     "# 3D histogram: [subjet_pt, ln(0.8/Delta), ln(kT/GeV)]\n",
     "mc_nom = f[\"mc_nom\"].to_numpy()\n",
     "ratio_edges = mc_nom[1:]\n",
-    "mc_nom = mc_nom[0]"
+    "mc_nom = mc_nom[0]\n",
+    "\n",
+    "ratio_nom = f[\"ratio_nom\"].to_numpy()[0]"
    ]
   },
   {
@@ -436,7 +439,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "plt.imshow(sig_lp_hist.values()[0])"
"plt.imshow(sig_lp_hist.values()[0])" + "np.min(ratio_nom[ratio_nom > 0])" ] }, { @@ -445,7 +448,10 @@ "metadata": {}, "outputs": [], "source": [ - "plt.imshow(mc_nom[0])" + "with (package_path / f\"corrections/lp_ratios/signals/2018_GluGluToHHTobbVV_node_cHHH1.hist\").open(\n", + " \"rb\"\n", + ") as f:\n", + " sig_lp_hist = pickle.load(f)" ] }, { @@ -454,8 +460,16 @@ "metadata": {}, "outputs": [], "source": [ - "plt.imshow(sig_mc_ratio_pt[5])\n", - "plt.colorbar()" + "plt.imshow(sig_lp_hist.values()[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.imshow(mc_nom[0])" ] }, { @@ -465,7 +479,10 @@ "outputs": [], "source": [ "mc_tot = np.sum(mc_nom)\n", - "sig_tot = sig_lp_hist.sum()" + "sig_tot = sig_lp_hist.sum()\n", + "sig_mc_ratio = np.clip(\n", + " np.nan_to_num((sig_lp_hist.values() / sig_tot) / (mc_nom / mc_tot), nan=1), 0.5, 2.0\n", + ")" ] }, { @@ -487,9 +504,8 @@ "metadata": {}, "outputs": [], "source": [ - "sig_mc_ratio = np.clip(\n", - " np.nan_to_num((sig_lp_hist.values() / sig_tot) / (mc_nom / mc_tot), nan=1), 0.5, 2.0\n", - ")" + "plt.imshow(sig_mc_ratio_pt[0])\n", + "plt.colorbar()" ] }, { diff --git a/src/HHbbVV/postprocessing/TrainBDT.py b/src/HHbbVV/postprocessing/TrainBDT.py index 5c649b3e..80a0a3c5 100644 --- a/src/HHbbVV/postprocessing/TrainBDT.py +++ b/src/HHbbVV/postprocessing/TrainBDT.py @@ -821,6 +821,7 @@ def do_inference_year( # preds = preds[:, :-1] if multiclass else preds[:, 1] # save n-1 probs to save space preds = preds if multiclass else preds[:, 1] np.save(f"{model_dir}/inferences/{year}/preds.npy", preds) + print(preds) if jec_jmsr_shifts: for jshift in jec_shifts: diff --git a/src/HHbbVV/postprocessing/corrections.py b/src/HHbbVV/postprocessing/corrections.py index 64b95dd5..a6e68b74 100644 --- a/src/HHbbVV/postprocessing/corrections.py +++ b/src/HHbbVV/postprocessing/corrections.py @@ -310,10 +310,20 @@ def postprocess_lpsfs( else: raise ValueError("LP SF shapes are invalid") + nom_mean = None for key in ["lp_sf_nom", "lp_sf_toys", "lp_sf_pt_extrap_vars"] + sf_vars: CLIP = 5.0 td[key] = np.clip(np.nan_to_num(td[key], nan=1.0), 1.0 / CLIP, CLIP) - td[key] = td[key] / np.mean(td[key], axis=0) + + if key == "lp_sf_nom": + nom_mean = np.mean(td[key], axis=0) + + if "unmatched" not in key: + td[key] = td[key] / np.mean(td[key], axis=0) + else: + # unmatched normalization is otherwise dominated by unmatched jets which aren't in the pass regions + # which artificially inflates this uncertainty + td[key] = td[key] / nom_mean # add to dataframe if save_all: @@ -389,4 +399,6 @@ def get_lpsf( # tot_rel_unc = np.mean([tot_rel_unc_up, tot_rel_unc_down]) tot_unc = (lp_sf * tot_rel_unc_up, lp_sf * tot_rel_unc_down) + # breakpoint() + return lp_sf, tot_unc, uncs, uncs_asym diff --git a/src/HHbbVV/postprocessing/postprocessing.py b/src/HHbbVV/postprocessing/postprocessing.py index 1ba3ed05..4dad9bd9 100644 --- a/src/HHbbVV/postprocessing/postprocessing.py +++ b/src/HHbbVV/postprocessing/postprocessing.py @@ -520,8 +520,11 @@ def _add_nonres_columns(df, bb_mask, vbf_vars=False, ptlabel="", mlabel=""): if f"vbf_Mass_jj{ptlabel}" not in df.columns: df[f"vbf_Mass_jj{ptlabel}"] = jj.M + # df[f"vbf_Mass_jj{ptlabel}"] = np.nan_to_num(jj.M) if "vbf_dEta_jj" not in df.columns: df["vbf_dEta_jj"] = np.abs(vbf1.eta - vbf2.eta) + # df["vbf_dEta_jj"] = np.nan_to_num(np.abs(vbf1.eta - vbf2.eta)) + # print(f"VBF jet vars: {time.time() - start:.2f}") if not vbf_vars: