lp postprocessing

rkansal47 committed Jul 24, 2024
1 parent 316ee18 commit 24b9096
Showing 16 changed files with 301 additions and 164 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -197,3 +197,4 @@ src/HHbbVV/postprocessing/templates_old
src/HHbbVV/postprocessing/outs

paper/plots
temp
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -20,7 +20,7 @@ repos:
    rev: "v4.6.0"
    hooks:
      - id: check-added-large-files
        args: ["--maxkb=10000"]
      - id: check-case-conflict
      - id: check-merge-conflict
      - id: check-symlinks
8 changes: 8 additions & 0 deletions README.md
@@ -227,6 +227,14 @@ Or just signal:
python src/condor/submit.py --year 2017 --tag $TAG --samples HH --subsamples GluGluToHHTobbVV_node_cHHH1 --processor skimmer --submit
```


Submitting jobs for all signal samples to save only their Lund plane densities (no skims or inference):

```bash
for year in 2016APV 2016 2017 2018; do
    python src/condor/submit_from_yaml.py --year $year --tag 24Jul24LundPlaneDensity \
        --processor skimmer --git-branch update_lp \
        --yaml src/condor/submit_configs/skimmer_24_07_24_signal_lp.yaml \
        --site ucsd --submit --no-save-skims --no-inference
done
```


### TaggerInputSkimmer

Applies a loose pre-selection cut and saves ntuples with training inputs.
4 binary files not shown.
2 changes: 1 addition & 1 deletion src/HHbbVV/hh_vars.py
@@ -55,7 +55,7 @@
("HHbbVV", "GluGluToHHTobbVV_node_cHHH1"),
("ggHH_kl_2p45_kt_1_HHbbVV", "GluGluToHHTobbVV_node_cHHH2p45"),
("ggHH_kl_5_kt_1_HHbbVV", "GluGluToHHTobbVV_node_cHHH5"),
("ggHH_kl_0_kt_1_HHbbVV", "GluGluToHHTobbVV_node_cHHH0"),
# ("ggHH_kl_0_kt_1_HHbbVV", "GluGluToHHTobbVV_node_cHHH0"), # not used in combination
("VBFHHbbVV", "VBF_HHTobbVV_CV_1_C2V_1_C3_1"),
("qqHH_CV_1_C2V_0_kl_1_HHbbVV", "VBF_HHTobbVV_CV_1_C2V_0_C3_1"),
("qqHH_CV_1p5_C2V_1_kl_1_HHbbVV", "VBF_HHTobbVV_CV_1_5_C2V_1_C3_1"),
66 changes: 57 additions & 9 deletions src/HHbbVV/postprocessing/BDTPreProcessing.py
@@ -1,14 +1,18 @@
from __future__ import annotations

import argparse
import warnings
from collections import OrderedDict
from copy import copy
from pathlib import Path

import pandas as pd
import postprocessing
import TrainBDT
import utils
from pandas.errors import SettingWithCopyWarning

from HHbbVV import run_utils
from HHbbVV.hh_vars import (
BDT_sample_order,
jec_shifts,
@@ -69,20 +73,27 @@ def main(args):

    bdt_data_dir = args.data_dir / "bdt_data"
    bdt_data_dir.mkdir(exist_ok=True)

    for key in copy(BDT_sample_order):
        if key not in all_samples:
            BDT_sample_order.remove(key)

    bdt_events_dict = get_bdt_data(events_dict, bb_masks, BDT_sample_order)

    if args.save_data:
        save_bdt_data(
            bdt_events_dict, BDT_sample_order, bdt_data_dir / f"{args.year}_bdt_data.parquet"
        )

    if args.inference:
        run_inference(
            args.year, bdt_events_dict, args.bdt_model, args.bdt_preds_dir, args.do_jshifts
        )


def get_bdt_data(
    events_dict: dict[str, pd.DataFrame],
    bb_masks: dict[str, pd.DataFrame],
    BDT_sample_order: list[str],
):

jec_jmsr_vars = []

for var in BDT_data_vars:
@@ -103,6 +114,13 @@ save_bdt_data(
events["Dataset"] = key
bdt_events_dict.append(events)

return bdt_events_dict


def save_bdt_data(bdt_events_dict: list[pd.DataFrame], BDT_sample_order: list[str], out_file: Path):
import pyarrow as pa
import pyarrow.parquet as pq

print("Saving BDT data to", out_file)

bdt_events = pd.concat(bdt_events_dict, axis=0)
@@ -117,7 +135,37 @@ def save_bdt_data(
f.write(str(sample_order_dict))


def run_inference(
    year: str,
    bdt_events_dict: list[pd.DataFrame],
    bdt_model: str,
    bdt_preds_dir: str,
    do_jshifts: bool,
):
    import xgboost as xgb

    # load the trained BDT from the given model path
    model = xgb.XGBClassifier()
    model.load_model(bdt_model)

    bdt_events = pd.concat(bdt_events_dict, axis=0)

    TrainBDT.do_inference_year(
        model,
        Path(bdt_preds_dir),  # do_inference_year builds output paths with the / operator
        year,
        bdt_events,
        TrainBDT.AllTaggerBDTVars,
        jec_jmsr_shifts=do_jshifts,
        multiclass=True,
    )
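A hypothetical standalone call of `run_inference`, assuming `bdt_events_dict` was produced by `get_bdt_data` above; the model path is the CLI default added below, and the predictions directory is made up:

```python
run_inference(
    year="2017",
    bdt_events_dict=bdt_events_dict,
    bdt_model="src/HHbbVV/postprocessing/bdt_models/24_04_05_k2v0_training_eqsig_vbf_vars_rm_deta.model",
    bdt_preds_dir="temp/bdt_preds",  # hypothetical output directory
    do_jshifts=False,  # skip JEC/JMSR-shifted inference
)
```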


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    run_utils.add_bool_arg(parser, "save-data", default=True, help="save preprocessed data")
    run_utils.add_bool_arg(parser, "inference", default=False, help="run inference on data")
    parser.add_argument(
        "--bdt-model",
        default="src/HHbbVV/postprocessing/bdt_models/24_04_05_k2v0_training_eqsig_vbf_vars_rm_deta.model",
        help="path to BDT model, if running inference",
        type=str,
    )
    args = postprocessing.parse_args(parser)
    args.data_dir = Path(args.data_dir)
    main(args)
100 changes: 57 additions & 43 deletions src/HHbbVV/postprocessing/TrainBDT.py
@@ -74,7 +74,7 @@
"VVFatJetPtOverbbFatJetPt",
"vbf_dEta_jj",
"DijetdPhi", # TODO: current dPhi is buggy
"DijetdEta",
# "DijetdEta",
]


@@ -786,59 +786,73 @@ def evaluate_model(
pickle.dump(test, f)


def do_inference_year(
    model: xgb.XGBClassifier,
    model_dir: Path,
    year: str,
    data: pd.DataFrame,
    bdtVars: list[str],
    jec_jmsr_shifts: bool = True,
    multiclass: bool = False,
):
    """Run BDT inference for a single year and save the predictions."""
    import time

    year_data_dict = {year: data}
    (model_dir / "inferences" / year).mkdir(exist_ok=True, parents=True)

sample_order = list(pd.unique(data["Dataset"]))
value_counts = data["Dataset"].value_counts()
sample_order_dict = OrderedDict([(sample, value_counts[sample]) for sample in sample_order])

with (model_dir / f"inferences/{year}/sample_order.txt").open("w") as f:
f.write(str(sample_order_dict))

print("Running inference")
X = get_X(year_data_dict, bdtVars)
model.get_booster().feature_names = bdtVars

print(X)

start = time.time()
preds = model.predict_proba(X)
print(f"Finished in {time.time() - start:.2f}s")
# preds = preds[:, :-1] if multiclass else preds[:, 1] # save n-1 probs to save space
preds = preds if multiclass else preds[:, 1]
np.save(f"{model_dir}/inferences/{year}/preds.npy", preds)

if jec_jmsr_shifts:
for jshift in jec_shifts:
print("Running inference for", jshift)
X, mcvars = get_X(year_data_dict, bdtVars, jec_shift=jshift)
# have to change model's feature names since we're passing in a dataframe
model.get_booster().feature_names = mcvars
preds = model.predict_proba(X)
preds = preds if multiclass else preds[:, 1]
np.save(f"{model_dir}/inferences/{year}/preds_{jshift}.npy", preds)

for jshift in jmsr_shifts:
print("Running inference for", jshift)
X, mcvars = get_X(year_data_dict, bdtVars, jmsr_shift=jshift)
# have to change model's feature names since we're passing in a dataframe
model.get_booster().feature_names = mcvars
preds = model.predict_proba(X)
preds = preds if multiclass else preds[:, 1]
np.save(f"{model_dir}/inferences/{year}/preds_{jshift}.npy", preds)


def do_inference(
    model: xgb.XGBClassifier,
    model_dir: Path,
    data_dict: dict[str, pd.DataFrame],
    bdtVars: list[str],
    jec_jmsr_shifts: bool = True,
    multiclass: bool = False,
):
    """Wrapper to run inference over all years."""
    for year, data in data_dict.items():
        do_inference_year(model, model_dir, year, data, bdtVars, jec_jmsr_shifts, multiclass)
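Hypothetical usage of the wrapper, assuming per-year DataFrames are already in memory and `model` was loaded as in `BDTPreProcessing.run_inference`:

```python
from pathlib import Path

data_dict = {"2017": events_2017, "2018": events_2018}  # hypothetical DataFrames
do_inference(
    model,
    Path("trained_models/my_bdt"),  # per-year inference subdirectories are created here
    data_dict,
    AllTaggerBDTVars,
    jec_jmsr_shifts=False,
    multiclass=True,
)
```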


if __name__ == "__main__":
Binary file not shown.
