lp postprocessing

rkansal47 committed Jul 24, 2024
1 parent 316ee18 commit 24b9096
Showing 16 changed files with 301 additions and 164 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -197,3 +197,4 @@ src/HHbbVV/postprocessing/templates_old
src/HHbbVV/postprocessing/outs

paper/plots
temp
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -20,7 +20,7 @@ repos:
    rev: "v4.6.0"
    hooks:
      - id: check-added-large-files
        args: ["--maxkb=10000"]
      - id: check-case-conflict
      - id: check-merge-conflict
      - id: check-symlinks
8 changes: 8 additions & 0 deletions README.md
@@ -227,6 +227,14 @@ Or just signal:
python src/condor/submit.py --year 2017 --tag $TAG --samples HH --subsamples GluGluToHHTobbVV_node_cHHH1 --processor skimmer --submit
```


Submitting jobs for all signal samples to save only their Lund plane densities (no skims or inference):

```bash
for year in 2016APV 2016 2017 2018; do
    python src/condor/submit_from_yaml.py --year $year --tag 24Jul24LundPlaneDensity \
        --processor skimmer --git-branch update_lp \
        --yaml src/condor/submit_configs/skimmer_24_07_24_signal_lp.yaml \
        --site ucsd --submit --no-save-skims --no-inference
done
```


### TaggerInputSkimmer

Applies a loose pre-selection cut and saves ntuples with training inputs.
4 binary files not shown.
2 changes: 1 addition & 1 deletion src/HHbbVV/hh_vars.py
@@ -55,7 +55,7 @@
("HHbbVV", "GluGluToHHTobbVV_node_cHHH1"),
("ggHH_kl_2p45_kt_1_HHbbVV", "GluGluToHHTobbVV_node_cHHH2p45"),
("ggHH_kl_5_kt_1_HHbbVV", "GluGluToHHTobbVV_node_cHHH5"),
("ggHH_kl_0_kt_1_HHbbVV", "GluGluToHHTobbVV_node_cHHH0"),
# ("ggHH_kl_0_kt_1_HHbbVV", "GluGluToHHTobbVV_node_cHHH0"), # not used in combination
("VBFHHbbVV", "VBF_HHTobbVV_CV_1_C2V_1_C3_1"),
("qqHH_CV_1_C2V_0_kl_1_HHbbVV", "VBF_HHTobbVV_CV_1_C2V_0_C3_1"),
("qqHH_CV_1p5_C2V_1_kl_1_HHbbVV", "VBF_HHTobbVV_CV_1_5_C2V_1_C3_1"),
66 changes: 57 additions & 9 deletions src/HHbbVV/postprocessing/BDTPreProcessing.py
@@ -1,14 +1,18 @@
from __future__ import annotations

import argparse
import warnings
from collections import OrderedDict
from copy import copy
from pathlib import Path

import pandas as pd
import postprocessing
import TrainBDT
import utils
from pandas.errors import SettingWithCopyWarning

from HHbbVV import run_utils
from HHbbVV.hh_vars import (
BDT_sample_order,
jec_shifts,
@@ -69,20 +73,27 @@ def main(args):

    bdt_data_dir = args.data_dir / "bdt_data"
    bdt_data_dir.mkdir(exist_ok=True)

    for key in copy(BDT_sample_order):
        if key not in all_samples:
            BDT_sample_order.remove(key)

    bdt_events_dict = get_bdt_data(events_dict, bb_masks, BDT_sample_order)

    if args.save_data:
        save_bdt_data(
            bdt_events_dict, BDT_sample_order, bdt_data_dir / f"{args.year}_bdt_data.parquet"
        )

    if args.inference:
        run_inference(
            args.year, bdt_events_dict, args.bdt_model, args.bdt_preds_dir, args.do_jshifts
        )


def get_bdt_data(
    events_dict: dict[str, pd.DataFrame],
    bb_masks: dict[str, pd.DataFrame],
    BDT_sample_order: list[str],
):

jec_jmsr_vars = []

for var in BDT_data_vars:
@@ -103,6 +114,13 @@ save_bdt_data(
events["Dataset"] = key
bdt_events_dict.append(events)

return bdt_events_dict


def save_bdt_data(bdt_events_dict: list[pd.DataFrame], BDT_sample_order: list[str], out_file: Path):
import pyarrow as pa
import pyarrow.parquet as pq

print("Saving BDT data to", out_file)

bdt_events = pd.concat(bdt_events_dict, axis=0)
@@ -117,7 +135,37 @@ def save_bdt_data(
f.write(str(sample_order_dict))


def run_inference(
    year: str,
    bdt_events_dict: list[pd.DataFrame],
    bdt_model: str,
    bdt_preds_dir: str,
    do_jshifts: bool,
):
    import xgboost as xgb

    # load the trained BDT from the given model path
    model = xgb.XGBClassifier()
    model.load_model(bdt_model)

    bdt_events = pd.concat(bdt_events_dict, axis=0)

    TrainBDT.do_inference_year(
        model,
        Path(bdt_preds_dir),  # do_inference_year builds output paths with the / operator
        year,
        bdt_events,
        TrainBDT.AllTaggerBDTVars,
        jec_jmsr_shifts=do_jshifts,
        multiclass=True,
    )
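A hypothetical standalone call of `run_inference`, assuming `bdt_events_dict` was produced by `get_bdt_data` above; the model path is the CLI default added below, and the predictions directory is made up:

```python
run_inference(
    year="2017",
    bdt_events_dict=bdt_events_dict,
    bdt_model="src/HHbbVV/postprocessing/bdt_models/24_04_05_k2v0_training_eqsig_vbf_vars_rm_deta.model",
    bdt_preds_dir="temp/bdt_preds",  # hypothetical output directory
    do_jshifts=False,  # skip JEC/JMSR-shifted inference
)
```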


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    run_utils.add_bool_arg(parser, "save-data", default=True, help="save preprocessed data")
    run_utils.add_bool_arg(parser, "inference", default=False, help="run inference on data")
    parser.add_argument(
        "--bdt-model",
        default="src/HHbbVV/postprocessing/bdt_models/24_04_05_k2v0_training_eqsig_vbf_vars_rm_deta.model",
        help="path to BDT model, if running inference",
        type=str,
    )
    args = postprocessing.parse_args(parser)
    args.data_dir = Path(args.data_dir)
    main(args)
100 changes: 57 additions & 43 deletions src/HHbbVV/postprocessing/TrainBDT.py
@@ -74,7 +74,7 @@
"VVFatJetPtOverbbFatJetPt",
"vbf_dEta_jj",
"DijetdPhi", # TODO: current dPhi is buggy
"DijetdEta",
# "DijetdEta",
]


@@ -786,59 +786,73 @@ def evaluate_model(
pickle.dump(test, f)


def do_inference_year(
    model: xgb.XGBClassifier,
    model_dir: Path,
    year: str,
    data: pd.DataFrame,
    bdtVars: list[str],
    jec_jmsr_shifts: bool = True,
    multiclass: bool = False,
):
    """Run BDT inference for a single year and save the predictions."""
    import time

    year_data_dict = {year: data}
    (model_dir / "inferences" / year).mkdir(exist_ok=True, parents=True)

sample_order = list(pd.unique(data["Dataset"]))
value_counts = data["Dataset"].value_counts()
sample_order_dict = OrderedDict([(sample, value_counts[sample]) for sample in sample_order])

with (model_dir / f"inferences/{year}/sample_order.txt").open("w") as f:
f.write(str(sample_order_dict))

print("Running inference")
X = get_X(year_data_dict, bdtVars)
model.get_booster().feature_names = bdtVars

print(X)

start = time.time()
preds = model.predict_proba(X)
print(f"Finished in {time.time() - start:.2f}s")
# preds = preds[:, :-1] if multiclass else preds[:, 1] # save n-1 probs to save space
preds = preds if multiclass else preds[:, 1]
np.save(f"{model_dir}/inferences/{year}/preds.npy", preds)

if jec_jmsr_shifts:
for jshift in jec_shifts:
print("Running inference for", jshift)
X, mcvars = get_X(year_data_dict, bdtVars, jec_shift=jshift)
# have to change model's feature names since we're passing in a dataframe
model.get_booster().feature_names = mcvars
preds = model.predict_proba(X)
preds = preds if multiclass else preds[:, 1]
np.save(f"{model_dir}/inferences/{year}/preds_{jshift}.npy", preds)

for jshift in jmsr_shifts:
print("Running inference for", jshift)
X, mcvars = get_X(year_data_dict, bdtVars, jmsr_shift=jshift)
# have to change model's feature names since we're passing in a dataframe
model.get_booster().feature_names = mcvars
preds = model.predict_proba(X)
preds = preds if multiclass else preds[:, 1]
np.save(f"{model_dir}/inferences/{year}/preds_{jshift}.npy", preds)


def do_inference(
    model: xgb.XGBClassifier,
    model_dir: Path,
    data_dict: dict[str, pd.DataFrame],
    bdtVars: list[str],
    jec_jmsr_shifts: bool = True,
    multiclass: bool = False,
):
    """Wrapper to run inference over all years."""
    for year, data in data_dict.items():
        do_inference_year(model, model_dir, year, data, bdtVars, jec_jmsr_shifts, multiclass)
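Hypothetical usage of the wrapper, assuming per-year DataFrames are already in memory and `model` was loaded as in `BDTPreProcessing.run_inference`:

```python
from pathlib import Path

data_dict = {"2017": events_2017, "2018": events_2018}  # hypothetical DataFrames
do_inference(
    model,
    Path("trained_models/my_bdt"),  # per-year inference subdirectories are created here
    data_dict,
    AllTaggerBDTVars,
    jec_jmsr_shifts=False,
    multiclass=True,
)
```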


if __name__ == "__main__":
Binary file not shown.
