diff --git a/.gitignore b/.gitignore index 8380c5db..073a78d0 100644 --- a/.gitignore +++ b/.gitignore @@ -197,3 +197,4 @@ src/HHbbVV/postprocessing/templates_old src/HHbbVV/postprocessing/outs paper/plots +temp diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index dea80fd6..2927dd23 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,7 +20,7 @@ repos: rev: "v4.6.0" hooks: - id: check-added-large-files - args: ["--maxkb=2000"] + args: ["--maxkb=10000"] - id: check-case-conflict - id: check-merge-conflict - id: check-symlinks diff --git a/README.md b/README.md index 26755488..f6fc36a2 100644 --- a/README.md +++ b/README.md @@ -227,6 +227,14 @@ Or just signal: python src/condor/submit.py --year 2017 --tag $TAG --samples HH --subsamples GluGluToHHTobbVV_node_cHHH1 --processor skimmer --submit ``` + +Submitting signal files to get only the Lund plane densities of all the signals: + +```bash +for year in 2016APV 2016 2017 2018; do python src/condor/submit_from_yaml.py --year $year --tag 24Jul24LundPlaneDensity --processor skimmer --git-branch update_lp --yaml src/condor/submit_configs/skimmer_24_07_24_signal_lp.yaml --site ucsd --submit --no-save-skims --no-inference; done +``` + + ### TaggerInputSkimmer Applies a loose pre-selection cut, saves ntuples with training inputs. diff --git a/src/HHbbVV/corrections/lp_ratios/signals/2016APV_GluGluToHHTobbVV_node_cHHH1.hist b/src/HHbbVV/corrections/lp_ratios/signals/2016APV_GluGluToHHTobbVV_node_cHHH1.hist new file mode 100644 index 00000000..97ea5ba0 Binary files /dev/null and b/src/HHbbVV/corrections/lp_ratios/signals/2016APV_GluGluToHHTobbVV_node_cHHH1.hist differ diff --git a/src/HHbbVV/corrections/lp_ratios/signals/2016_GluGluToHHTobbVV_node_cHHH1.hist b/src/HHbbVV/corrections/lp_ratios/signals/2016_GluGluToHHTobbVV_node_cHHH1.hist new file mode 100644 index 00000000..ea18ea99 Binary files /dev/null and b/src/HHbbVV/corrections/lp_ratios/signals/2016_GluGluToHHTobbVV_node_cHHH1.hist differ diff --git a/src/HHbbVV/corrections/lp_ratios/signals/2017_GluGluToHHTobbVV_node_cHHH1.hist b/src/HHbbVV/corrections/lp_ratios/signals/2017_GluGluToHHTobbVV_node_cHHH1.hist new file mode 100644 index 00000000..0ada776f Binary files /dev/null and b/src/HHbbVV/corrections/lp_ratios/signals/2017_GluGluToHHTobbVV_node_cHHH1.hist differ diff --git a/src/HHbbVV/corrections/lp_ratios/signals/2018_GluGluToHHTobbVV_node_cHHH1.hist b/src/HHbbVV/corrections/lp_ratios/signals/2018_GluGluToHHTobbVV_node_cHHH1.hist new file mode 100644 index 00000000..34b7de97 Binary files /dev/null and b/src/HHbbVV/corrections/lp_ratios/signals/2018_GluGluToHHTobbVV_node_cHHH1.hist differ diff --git a/src/HHbbVV/hh_vars.py b/src/HHbbVV/hh_vars.py index c25e6b90..6419c6a1 100644 --- a/src/HHbbVV/hh_vars.py +++ b/src/HHbbVV/hh_vars.py @@ -55,7 +55,7 @@ ("HHbbVV", "GluGluToHHTobbVV_node_cHHH1"), ("ggHH_kl_2p45_kt_1_HHbbVV", "GluGluToHHTobbVV_node_cHHH2p45"), ("ggHH_kl_5_kt_1_HHbbVV", "GluGluToHHTobbVV_node_cHHH5"), - ("ggHH_kl_0_kt_1_HHbbVV", "GluGluToHHTobbVV_node_cHHH0"), + # ("ggHH_kl_0_kt_1_HHbbVV", "GluGluToHHTobbVV_node_cHHH0"), # not used in combination ("VBFHHbbVV", "VBF_HHTobbVV_CV_1_C2V_1_C3_1"), ("qqHH_CV_1_C2V_0_kl_1_HHbbVV", "VBF_HHTobbVV_CV_1_C2V_0_C3_1"), ("qqHH_CV_1p5_C2V_1_kl_1_HHbbVV", "VBF_HHTobbVV_CV_1_5_C2V_1_C3_1"), diff --git a/src/HHbbVV/postprocessing/BDTPreProcessing.py b/src/HHbbVV/postprocessing/BDTPreProcessing.py index 9b0c14f4..d0a64222 100644 --- a/src/HHbbVV/postprocessing/BDTPreProcessing.py +++ 
b/src/HHbbVV/postprocessing/BDTPreProcessing.py @@ -1,14 +1,18 @@ from __future__ import annotations +import argparse import warnings from collections import OrderedDict +from copy import copy from pathlib import Path import pandas as pd import postprocessing +import TrainBDT import utils from pandas.errors import SettingWithCopyWarning +from HHbbVV import run_utils from HHbbVV.hh_vars import ( BDT_sample_order, jec_shifts, @@ -69,20 +73,27 @@ def main(args): bdt_data_dir = args.data_dir / "bdt_data" bdt_data_dir.mkdir(exist_ok=True) - save_bdt_data( - events_dict, bb_masks, BDT_sample_order, bdt_data_dir / f"{args.year}_bdt_data.parquet" - ) + + for key in copy(BDT_sample_order): + if key not in all_samples: + BDT_sample_order.remove(key) + + bdt_events_dict = get_bdt_data(events_dict, bb_masks, BDT_sample_order) + + if args.save_data: + save_bdt_data( + bdt_events_dict, BDT_sample_order, bdt_data_dir / f"{args.year}_bdt_data.parquet" + ) + + if args.inference: + run_inference(args.year, bdt_events_dict, args.bdt_preds_dir, args.do_jshifts) -def save_bdt_data( +def get_bdt_data( events_dict: dict[str, pd.DataFrame], bb_masks: dict[str, pd.DataFrame], BDT_sample_order: str, - out_file: Path, ): - import pyarrow as pa - import pyarrow.parquet as pq - jec_jmsr_vars = [] for var in BDT_data_vars: @@ -103,6 +114,13 @@ def save_bdt_data( events["Dataset"] = key bdt_events_dict.append(events) + return bdt_events_dict + + +def save_bdt_data(bdt_events_dict: list[pd.DataFrame], BDT_sample_order: list[str], out_file: Path): + import pyarrow as pa + import pyarrow.parquet as pq + print("Saving BDT data to", out_file) bdt_events = pd.concat(bdt_events_dict, axis=0) @@ -117,7 +135,37 @@ def save_bdt_data( f.write(str(sample_order_dict)) +def run_inference( + year: str, bdt_events_dict: list[pd.DataFrame], bdt_preds_dir: str, do_jshifts: bool +): + import xgboost as xgb + + model = xgb.XGBClassifier() + model.load_model(args.bdt_model) + + bdt_events = pd.concat(bdt_events_dict, axis=0) + + TrainBDT.do_inference_year( + model, + bdt_preds_dir, + year, + bdt_events, + TrainBDT.AllTaggerBDTVars, + jec_jmsr_shifts=do_jshifts, + multiclass=True, + ) + + if __name__ == "__main__": - args = postprocessing.parse_args() + parser = argparse.ArgumentParser() + run_utils.add_bool_arg(parser, "save-data", default=True, help="save preprocessed data") + run_utils.add_bool_arg(parser, "inference", default=False, help="run inference on data") + parser.add_argument( + "--bdt-model", + default="src/HHbbVV/postprocessing/bdt_models/24_04_05_k2v0_training_eqsig_vbf_vars_rm_deta.model", + help="path to BDT model, if running inference", + type=str, + ) + args = postprocessing.parse_args(parser) args.data_dir = Path(args.data_dir) main(args) diff --git a/src/HHbbVV/postprocessing/TrainBDT.py b/src/HHbbVV/postprocessing/TrainBDT.py index 3a5dfd52..5c649b3e 100644 --- a/src/HHbbVV/postprocessing/TrainBDT.py +++ b/src/HHbbVV/postprocessing/TrainBDT.py @@ -74,7 +74,7 @@ "VVFatJetPtOverbbFatJetPt", "vbf_dEta_jj", "DijetdPhi", # TODO: current dPhi is buggy - "DijetdEta", + # "DijetdEta", ] @@ -786,59 +786,73 @@ def evaluate_model( pickle.dump(test, f) -def do_inference( +def do_inference_year( model: xgb.XGBClassifier, model_dir: str, - data_dict: dict[str, pd.DataFrame], + year: str, + data: pd.DataFrame, bdtVars: list[str], jec_jmsr_shifts: bool = True, multiclass: bool = False, ): - """ """ import time (model_dir / "inferences").mkdir(exist_ok=True, parents=True) + year_data_dict = {year: data} + (model_dir / "inferences" / 
year).mkdir(exist_ok=True, parents=True) + + sample_order = list(pd.unique(data["Dataset"])) + value_counts = data["Dataset"].value_counts() + sample_order_dict = OrderedDict([(sample, value_counts[sample]) for sample in sample_order]) + + with (model_dir / f"inferences/{year}/sample_order.txt").open("w") as f: + f.write(str(sample_order_dict)) + + print("Running inference") + X = get_X(year_data_dict, bdtVars) + model.get_booster().feature_names = bdtVars + + print(X) + + start = time.time() + preds = model.predict_proba(X) + print(f"Finished in {time.time() - start:.2f}s") + # preds = preds[:, :-1] if multiclass else preds[:, 1] # save n-1 probs to save space + preds = preds if multiclass else preds[:, 1] + np.save(f"{model_dir}/inferences/{year}/preds.npy", preds) + + if jec_jmsr_shifts: + for jshift in jec_shifts: + print("Running inference for", jshift) + X, mcvars = get_X(year_data_dict, bdtVars, jec_shift=jshift) + # have to change model's feature names since we're passing in a dataframe + model.get_booster().feature_names = mcvars + preds = model.predict_proba(X) + preds = preds if multiclass else preds[:, 1] + np.save(f"{model_dir}/inferences/{year}/preds_{jshift}.npy", preds) + + for jshift in jmsr_shifts: + print("Running inference for", jshift) + X, mcvars = get_X(year_data_dict, bdtVars, jmsr_shift=jshift) + # have to change model's feature names since we're passing in a dataframe + model.get_booster().feature_names = mcvars + preds = model.predict_proba(X) + preds = preds if multiclass else preds[:, 1] + np.save(f"{model_dir}/inferences/{year}/preds_{jshift}.npy", preds) + + +def do_inference( + model: xgb.XGBClassifier, + model_dir: str, + data_dict: dict[str, pd.DataFrame], + bdtVars: list[str], + jec_jmsr_shifts: bool = True, + multiclass: bool = False, +): + """Wrapper to run inference over all years""" for year, data in data_dict.items(): - year_data_dict = {year: data} - (model_dir / "inferences" / year).mkdir(exist_ok=True, parents=True) - - sample_order = list(pd.unique(data["Dataset"])) - value_counts = data["Dataset"].value_counts() - sample_order_dict = OrderedDict([(sample, value_counts[sample]) for sample in sample_order]) - - with (model_dir / f"inferences/{year}/sample_order.txt").open("w") as f: - f.write(str(sample_order_dict)) - - print("Running inference") - X = get_X(year_data_dict, bdtVars) - model.get_booster().feature_names = bdtVars - - start = time.time() - preds = model.predict_proba(X) - print(f"Finished in {time.time() - start:.2f}s") - # preds = preds[:, :-1] if multiclass else preds[:, 1] # save n-1 probs to save space - preds = preds if multiclass else preds[:, 1] - np.save(f"{model_dir}/inferences/{year}/preds.npy", preds) - - if jec_jmsr_shifts: - for jshift in jec_shifts: - print("Running inference for", jshift) - X, mcvars = get_X(year_data_dict, bdtVars, jec_shift=jshift) - # have to change model's feature names since we're passing in a dataframe - model.get_booster().feature_names = mcvars - preds = model.predict_proba(X) - preds = preds if multiclass else preds[:, 1] - np.save(f"{model_dir}/inferences/{year}/preds_{jshift}.npy", preds) - - for jshift in jmsr_shifts: - print("Running inference for", jshift) - X, mcvars = get_X(year_data_dict, bdtVars, jmsr_shift=jshift) - # have to change model's feature names since we're passing in a dataframe - model.get_booster().feature_names = mcvars - preds = model.predict_proba(X) - preds = preds if multiclass else preds[:, 1] - np.save(f"{model_dir}/inferences/{year}/preds_{jshift}.npy", preds) + 
do_inference_year(model, model_dir, year, data, bdtVars, jec_jmsr_shifts, multiclass) if __name__ == "__main__": diff --git a/src/HHbbVV/postprocessing/bdt_models/24_04_05_k2v0_training_eqsig_vbf_vars_rm_deta.model b/src/HHbbVV/postprocessing/bdt_models/24_04_05_k2v0_training_eqsig_vbf_vars_rm_deta.model new file mode 100644 index 00000000..3792b119 Binary files /dev/null and b/src/HHbbVV/postprocessing/bdt_models/24_04_05_k2v0_training_eqsig_vbf_vars_rm_deta.model differ diff --git a/src/HHbbVV/postprocessing/corrections.py b/src/HHbbVV/postprocessing/corrections.py index 7d571ee9..64b95dd5 100644 --- a/src/HHbbVV/postprocessing/corrections.py +++ b/src/HHbbVV/postprocessing/corrections.py @@ -2,6 +2,7 @@ import json import pickle +from copy import deepcopy from pathlib import Path import awkward as ak @@ -183,9 +184,10 @@ def get_uncorr_trig_eff_unc( def postprocess_lpsfs( events: pd.DataFrame, - num_jets: int = 2, + # num_jets: int = 2, num_lp_sf_toys: int = 100, save_all: bool = True, + CLIP: float = 5.0, ): """ (1) Splits LP SFs into bb and VV based on gen matching. @@ -197,27 +199,44 @@ def postprocess_lpsfs( num_jets (int, optional): The number of jets. Defaults to 2. num_lp_sf_toys (int, optional): The number of LP SF toys. Defaults to 100. save_all (bool, optional): Whether to save all LP SFs or only the nominal values. Defaults to True. + CLIP (float, optional): The value to clip the LP SFs at. Defaults to 5.0. """ + sf_vars = [ + # "lp_sf_nom", + "lp_sf_sys_down", + "lp_sf_sys_up", + "lp_sf_dist_down", + "lp_sf_dist_up", + "lp_sf_np_up", + "lp_sf_np_down", + "lp_sf_unmatched_up", + "lp_sf_unmatched_down", + ] + + matching_vars = [ + "lp_sf_double_matched_event", + "lp_sf_inside_boundary_quarks", + "lp_sf_outside_boundary_quarks", + "lp_sf_unmatched_quarks", + "lp_sf_rc_unmatched_quarks", + ] + # for jet in ["bb", "VV"]: for jet in ["VV"]: # temp dict td = {} # defaults of 1 for jets which aren't matched to anything - i.e. no SF - for key in ["lp_sf_nom", "lp_sf_sys_down", "lp_sf_sys_up"]: + for key in ["lp_sf_nom"] + sf_vars: td[key] = np.ones(len(events)) td["lp_sf_toys"] = np.ones((len(events), num_lp_sf_toys)) td["lp_sf_pt_extrap_vars"] = np.ones((len(events), num_lp_sf_toys)) # defaults of 0 - i.e. don't contribute to unc. 
- for key in [ - "lp_sf_double_matched_event", - "lp_sf_unmatched_quarks", - "lp_sf_boundary_quarks", - ]: + for key in matching_vars: td[key] = np.zeros(len(events)) # lp sfs saved for both jets @@ -225,27 +244,21 @@ def postprocess_lpsfs( raise NotImplementedError( "Updated LP SF post-processing not implemented yet for both jets with SFs" ) - # get events where we have a gen-matched fatjet - # ignore rare case (~0.002%) where two jets are matched to same gen Higgs - events.loc[np.sum(events[f"ak8FatJetH{jet}"], axis=1) > 1, f"ak8FatJetH{jet}"] = 0 - jet_match = events[f"ak8FatJetH{jet}"].astype(bool) - - # fill values from matched jets - for j in range(num_jets): - offset = num_lp_sf_toys + 1 - td["lp_sf_nom"][jet_match[j]] = events["lp_sf_lnN"][jet_match[j]][j * offset] - td["lp_sf_toys"][jet_match[j]] = events["lp_sf_lnN"][jet_match[j]].loc[ - :, j * offset + 1 : (j + 1) * offset - 1 - ] - - for key in [ - "lp_sf_sys_down", - "lp_sf_sys_up", - "lp_sf_double_matched_event", - "lp_sf_unmatched_quarks", - "lp_sf_boundary_quarks", - ]: - td[key][jet_match[j]] = events[key][jet_match[j]][j] + # # get events where we have a gen-matched fatjet + # # ignore rare case (~0.002%) where two jets are matched to same gen Higgs + # events.loc[np.sum(events[f"ak8FatJetH{jet}"], axis=1) > 1, f"ak8FatJetH{jet}"] = 0 + # jet_match = events[f"ak8FatJetH{jet}"].astype(bool) + + # # fill values from matched jets + # for j in range(num_jets): + # offset = num_lp_sf_toys + 1 + # td["lp_sf_nom"][jet_match[j]] = events["lp_sf_lnN"][jet_match[j]][j * offset] + # td["lp_sf_toys"][jet_match[j]] = events["lp_sf_lnN"][jet_match[j]].loc[ + # :, j * offset + 1 : (j + 1) * offset - 1 + # ] + + # for key in sf_vars + matching_vars: + # td[key][jet_match[j]] = events[key][jet_match[j]][j] # lp sfs saved only for hvv jet elif events["lp_sf_sys_up"].shape[1] == 1: # noqa: RET506 @@ -263,25 +276,43 @@ def postprocess_lpsfs( for key in [ "lp_sf_sys_down", "lp_sf_sys_up", - "lp_sf_double_matched_event", - "lp_sf_unmatched_quarks", - "lp_sf_boundary_quarks", - ]: + "lp_sf_dist_down", + "lp_sf_dist_up", + ] + matching_vars: td[key][jet_match] = events[key][jet_match].squeeze() + # num prongs uncertainty variations + up_prong_rc = ( # events which need re-clustering with +1 prong + (td["lp_sf_outside_boundary_quarks"] > 0) + | (td["lp_sf_double_matched_event"] > 0) + | (td["lp_sf_unmatched_quarks"] > 0) + ) + + down_prong_rc = ( # events which need re-clustering with -1 prong + (td["lp_sf_inside_boundary_quarks"] > 0) + | (td["lp_sf_double_matched_event"] > 0) + | (td["lp_sf_unmatched_quarks"] > 0) + ) + + for shift, prong_rc in [("up", up_prong_rc), ("down", down_prong_rc)]: + td[f"lp_sf_np_{shift}"] = deepcopy(td["lp_sf_nom"]) + # replace with NP up/down SFs where needed + td[f"lp_sf_np_{shift}"][prong_rc] = events[f"lp_sf_np_{shift}"][0][prong_rc] + + # unmatched quarks uncertainty variations + rc_unmatched = td["lp_sf_rc_unmatched_quarks"] > 0 + + for shift in ["up", "down"]: + td[f"lp_sf_unmatched_{shift}"] = deepcopy(td["lp_sf_nom"]) + # replace with max SF where needed + td[f"lp_sf_unmatched_{shift}"][rc_unmatched] = CLIP if shift == "up" else 1.0 / CLIP + else: raise ValueError("LP SF shapes are invalid") - for key in [ - "lp_sf_nom", - "lp_sf_toys", - "lp_sf_sys_down", - "lp_sf_sys_up", - "lp_sf_pt_extrap_vars", - ]: - # cut off at 10 - td[key] = np.minimum(td[key], 10) - # normalise + for key in ["lp_sf_nom", "lp_sf_toys", "lp_sf_pt_extrap_vars"] + sf_vars: + CLIP = 5.0 + td[key] = np.clip(np.nan_to_num(td[key], 
nan=1.0), 1.0 / CLIP, CLIP) td[key] = td[key] / np.mean(td[key], axis=0) # add to dataframe @@ -312,8 +343,6 @@ def get_lpsf( if sel is not None: events = events[sel] - tot_matched = np.sum(np.sum(events[f"ak8FatJetH{jet}"].astype(bool))) - weight = events[weight_key].to_numpy() tot_pre = np.sum(weight) tot_post = np.sum(weight * events[f"{jet}_lp_sf_nom"][0]) @@ -322,14 +351,15 @@ def get_lpsf( uncs = {} # difference in yields between up and down shifts on LP SFs - uncs["syst_unc"] = np.abs( - ( - np.sum(events[f"{jet}_lp_sf_sys_up"][0] * weight) - - np.sum(events[f"{jet}_lp_sf_sys_down"][0] * weight) + for unc in ["sys", "np"]: + uncs[f"{unc}_unc"] = np.abs( + ( + np.sum(events[f"{jet}_lp_sf_{unc}_up"][0] * weight) + - np.sum(events[f"{jet}_lp_sf_{unc}_down"][0] * weight) + ) + / 2 + / tot_post ) - / 2 - / tot_post - ) # std of yields after all smearings uncs["stat_unc"] = ( @@ -345,36 +375,18 @@ def get_lpsf( / tot_post ) - if VV: - # OR of double matched and boundary quarks - # >0.1 to avoid floating point errors - quark_unmatched = ( - events[f"{jet}_lp_sf_double_matched_event"][0] - + events[f"{jet}_lp_sf_boundary_quarks"][0] - ) > 0.1 - - sj_matching_unc = np.sum(quark_unmatched) - - # check quark <-> subjet matching for 2, 3, 4 prong-ed jets - # ignoring events which are already double matched / have boundary quarks to avoid double counting - num_prongs = events["ak8FatJetHVVNumProngs"][0][~quark_unmatched] - for nump in range(2, 5): - sj_matching_unc += ( - np.sum( - events[f"{jet}_lp_sf_unmatched_quarks"][0][~quark_unmatched][num_prongs == nump] - ) - / nump - ) + uncs_asym = {} - uncs["sj_matching_unc"] = sj_matching_unc / tot_matched - else: - num_prongs = 2 - uncs["sj_matching_unc"] = ( - (np.sum(events[f"{jet}_lp_sf_unmatched_quarks"][0]) / num_prongs) - + np.sum(events[f"{jet}_lp_sf_double_matched_event"][0]) - ) / tot_matched + for shift in ["up", "down"]: + uncs_asym[shift] = {} + for unc in ["dist", "unmatched"]: + uncs_asym[shift][unc] = np.abs( + (np.sum(events[f"{jet}_lp_sf_{unc}_{shift}"][0] * weight) - tot_post) / tot_post + ) - tot_rel_unc = np.linalg.norm(list(uncs.values())) - tot_unc = lp_sf * tot_rel_unc + tot_rel_unc_up = np.linalg.norm(list(uncs.values()) + list(uncs_asym["up"].values())) + tot_rel_unc_down = np.linalg.norm(list(uncs.values()) + list(uncs_asym["down"].values())) + # tot_rel_unc = np.mean([tot_rel_unc_up, tot_rel_unc_down]) + tot_unc = (lp_sf * tot_rel_unc_up, lp_sf * tot_rel_unc_down) - return lp_sf, tot_unc, uncs + return lp_sf, tot_unc, uncs, uncs_asym diff --git a/src/HHbbVV/postprocessing/postprocessing.py b/src/HHbbVV/postprocessing/postprocessing.py index 5275dbf5..1ba3ed05 100644 --- a/src/HHbbVV/postprocessing/postprocessing.py +++ b/src/HHbbVV/postprocessing/postprocessing.py @@ -518,6 +518,10 @@ def _add_nonres_columns(df, bb_mask, vbf_vars=False, ptlabel="", mlabel=""): if "DijetdPhi" not in df.columns: df["DijetdPhi"] = np.abs(bbJet.deltaphi(VVJet)) + if f"vbf_Mass_jj{ptlabel}" not in df.columns: + df[f"vbf_Mass_jj{ptlabel}"] = jj.M + if "vbf_dEta_jj" not in df.columns: + df["vbf_dEta_jj"] = np.abs(vbf1.eta - vbf2.eta) # print(f"VBF jet vars: {time.time() - start:.2f}") if not vbf_vars: @@ -541,10 +545,6 @@ def _add_nonres_columns(df, bb_mask, vbf_vars=False, ptlabel="", mlabel=""): df["vbf_dR_j1_Hbb"] = vbf2.deltaR(bbJet) if "vbf_dR_jj" not in df.columns: df["vbf_dR_jj"] = vbf1.deltaR(vbf2) - if f"vbf_Mass_jj{ptlabel}" not in df.columns: - df[f"vbf_Mass_jj{ptlabel}"] = jj.M - if "vbf_dEta_jj" not in df.columns: - df["vbf_dEta_jj"] 
= np.abs(vbf1.eta - vbf2.eta) if "DijetdEta" not in df.columns: df["DijetdEta"] = np.abs(bbJet.eta - VVJet.eta) @@ -1145,8 +1145,10 @@ def _lpsfs(args, filters, scan, scan_cuts, scan_wps, sig_keys, sig_samples): systematics, sig_samples=sig_samples, filters=filters, - all_years=True, + all_years=args.lp_sf_all_years, + year=args.year, bdt_preds_dir=args.bdt_preds_dir, + template_dir=args.template_dir, systs_file=systs_file, data_dir=args.signal_data_dirs[0], ) @@ -1172,6 +1174,7 @@ def _get_signal_all_years( samples: dict, filters: list, bdt_preds_dir: Path = None, + year: str = None, ): """Load signal samples for all years and combine them for LP SF measurements.""" print(f"Loading {sig_key} for all years") @@ -1198,7 +1201,9 @@ def _get_signal_all_years( for i in range(num_columns): column_labels.append(f"('{key}', '{i}')") - for year in years: + run_years = years if year is None else [year] + + for year in run_years: events_dict = load_samples( data_dir, {sig_key: samples[sig_key]}, @@ -1252,11 +1257,10 @@ def lpsfs( sig_keys: list[str], lp_selection_regions: Region | list[Region], systematics: dict, - events_dict: dict[str, pd.DataFrame] = None, - bb_masks: dict[str, pd.DataFrame] = None, sig_samples: dict[str, str] = None, filters: list = [], # noqa: B006 all_years: bool = False, + year: str = None, bdt_preds_dir: Path = None, template_dir: Path = None, systs_file: Path = None, @@ -1281,12 +1285,12 @@ def lpsfs( print(f"\nGetting LP SFs for {sig_key}") + assert data_dir is not None, "Need data_dir to load signals" + assert filters != [], "Need to specify filters" + assert sig_samples is not None, "Need sig_samples to load signals" + # SFs are correlated across all years so needs to be calculated with full dataset if all_years: - assert data_dir is not None, "Need data_dir to load signals for all years" - assert filters != [], "Need to specify filters to load signals for all years" - assert sig_samples is not None, "Need sig_samples to load signals for all years" - events_dict = _get_signal_all_years( sig_key, data_dir, sig_samples, filters, bdt_preds_dir ) @@ -1294,11 +1298,12 @@ def lpsfs( # ONLY FOR TESTING, can do just for a single year else: - assert events_dict is not None, "Need events_dict to calculate LP SFs for single year" - assert bb_masks is not None, "Need bb_masks to calculate LP SFs for single year" + events_dict = _get_signal_all_years( + sig_key, data_dir, sig_samples, filters, bdt_preds_dir, year=year + ) + bb_masks = bb_VV_assignment(events_dict) - events_dict[sig_key] = postprocess_lpsfs(events_dict[sig_key]) - continue + # continue for lp_region in lp_selection_regions: rlabel = lp_region.lpsf_region @@ -1313,13 +1318,17 @@ def lpsfs( rsysts[sig_key] = {} sel, _ = utils.make_selection(lp_region.cuts, events_dict, bb_masks) - lp_sf, unc, uncs = get_lpsf(events_dict[sig_key], sel[sig_key]) + lp_sf, unc, uncs_sym, uncs_asym = get_lpsf(events_dict[sig_key], sel[sig_key]) - print(f"LP Scale Factor for {sig_key} in {rlabel} region: {lp_sf:.2f} ± {unc:.2f}") + print( + f"LP Scale Factor for {sig_key} in {rlabel} region: {lp_sf:.2f} +{unc[0]:.2f}-{unc[1]:.2f}" + ) rsysts[sig_key]["lp_sf"] = lp_sf - rsysts[sig_key]["lp_sf_unc"] = unc / lp_sf - rsysts[sig_key]["lp_sf_uncs"] = uncs + rsysts[sig_key]["lp_sf_unc_up"] = unc[0] / lp_sf + rsysts[sig_key]["lp_sf_unc_down"] = unc[1] / lp_sf + rsysts[sig_key]["lp_sf_uncs_sym"] = uncs_sym + rsysts[sig_key]["lp_sf_uncs_asym"] = uncs_asym if systs_file is not None: with systs_file.open("w") as f: @@ -1333,9 +1342,13 @@ def lpsfs( for 
sig_key in sig_keys: systs = rsysts[sig_key] sf_table[sig_key] = { - "SF": f"{systs['lp_sf']:.2f} ± {systs['lp_sf'] * systs['lp_sf_unc']:.2f}", - **systs["lp_sf_uncs"], + "SF": f"{systs['lp_sf']:.2f} +{systs['lp_sf'] * systs['lp_sf_unc_up']:.2f}-{systs['lp_sf'] * systs['lp_sf_unc_down']:.2f}-", + **systs["lp_sf_uncs_sym"], } + for key in systs["lp_sf_uncs_asym"]["up"]: + sf_table[sig_key][ + key + ] = f"+{systs['lp_sf_uncs_asym']['up'][key]:.2f}-{systs['lp_sf_uncs_asym']['down'][key]:.2f}" print(f"\nLP Scale Factors in {rlabel}:\n", pd.DataFrame(sf_table).T) pd.DataFrame(sf_table).T.to_csv(f"{template_dir}/lpsfs_{rlabel}.csv") @@ -1916,8 +1929,10 @@ def save_templates( print("Saved templates to", template_file) -def parse_args(): - parser = argparse.ArgumentParser() +def parse_args(parser=None): + if parser is None: + parser = argparse.ArgumentParser() + parser.add_argument( "--data-dir", default=None, @@ -1935,7 +1950,7 @@ def parse_args(): parser.add_argument( "--year", - default="2017", + required=True, choices=["2016", "2016APV", "2017", "2018"], type=str, ) diff --git a/src/HHbbVV/scale_factors/accumulate_lp_hists.py b/src/HHbbVV/scale_factors/accumulate_lp_hists.py new file mode 100644 index 00000000..db86318c --- /dev/null +++ b/src/HHbbVV/scale_factors/accumulate_lp_hists.py @@ -0,0 +1,38 @@ +""" +Accumulating Lund plane density histograms from signal condor jobs. + +Author: Raghav Kansal +""" + +from __future__ import annotations + +import argparse +import pickle +from pathlib import Path + +from tqdm import tqdm + +from HHbbVV.hh_vars import nonres_samples, years +from HHbbVV.postprocessing import utils + +parser = argparse.ArgumentParser() +parser.add_argument("--data-path", required=True, help="path to skimmer outputs", type=str) +args = parser.parse_args() + + +package_path = Path(__file__).parent.parent.resolve() + +for year in years: + print(year) + for key in tqdm(nonres_samples.values()): + sig_lp_hist = utils.get_pickles(f"{args.data_path}/{year}/{key}/pickles", year, key)[ + "lp_hist" + ] + + # remove negatives + sig_lp_hist.values()[sig_lp_hist.values() < 0] = 0 + + with (package_path / f"corrections/lp_ratios/signals/{year}_{key}.hist").open("wb") as f: + pickle.dump(sig_lp_hist, f) + + break diff --git a/src/condor/check_jobs.py b/src/condor/check_jobs.py index bd43cff0..0f4e6d15 100644 --- a/src/condor/check_jobs.py +++ b/src/condor/check_jobs.py @@ -21,6 +21,7 @@ parser.add_argument("--user", default="rkansal", help="user", type=str) parser.add_argument("--site", default="lpc", help="t2 site", choices=["lpc", "ucsd"], type=str) +run_utils.add_bool_arg(parser, "check-parquet", default=True, help="check parquet files") run_utils.add_bool_arg(parser, "submit-missing", default=False, help="submit missing files") run_utils.add_bool_arg( parser, @@ -81,7 +82,7 @@ for sample in samples: print(f"Checking {sample}") - if not trigger_processor: + if not trigger_processor and args.check_parquet: if not Path(f"{eosdir}/{sample}/parquet").exists(): print_red(f"No parquet directory for {sample}!") @@ -115,7 +116,7 @@ int(out.split(".")[0].split("_")[-1]) for out in listdir(f"{eosdir}/{sample}/pickles") ] - if trigger_processor: + if trigger_processor or not args.check_parquet: print(f"Out pickles: {outs_pickles}") for i in range(jdl_dict[sample]): @@ -134,7 +135,7 @@ if args.submit_missing: os.system(f"condor_submit {jdl_file}") - if not trigger_processor and i not in outs_parquet: + if not trigger_processor and args.check_parquet and i not in outs_parquet: print_red(f"Missing 
output parquet #{i} for sample {sample}") diff --git a/src/condor/submit_configs/skimmer_24_07_24_signal_lp.yaml b/src/condor/submit_configs/skimmer_24_07_24_signal_lp.yaml index 1f197468..c154804a 100644 --- a/src/condor/submit_configs/skimmer_24_07_24_signal_lp.yaml +++ b/src/condor/submit_configs/skimmer_24_07_24_signal_lp.yaml @@ -16,7 +16,7 @@ "VBF_HHTobbVV_CV_1_C2V_1_C3_0", "VBF_HHTobbVV_CV_1_C2V_1_C3_2", ], - "files_per_job": 20, + "files_per_job": 10, "max_files": 60, "chunksize": 80000, # "maxchunks": 20
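
A closing note on the `corrections.py` changes above: `get_lpsf` now returns an asymmetric total uncertainty, combining the symmetric uncertainty terms with the up/down "dist" and "unmatched" variations in quadrature, separately for each shift direction. Below is a minimal sketch of that combination; the numbers are hypothetical and only the quadrature-sum formula mirrors the patch.

```python
# Minimal sketch of the asymmetric LP SF uncertainty combination in get_lpsf.
# The uncertainty values below are hypothetical; only the quadrature-sum
# formula follows the corrections.py changes in this patch.
import numpy as np

lp_sf = 0.85  # hypothetical nominal Lund plane scale factor

# symmetric relative uncertainties (keys as in the patch)
uncs_sym = {"sys_unc": 0.10, "np_unc": 0.08, "stat_unc": 0.05}

# asymmetric relative uncertainties, one set per shift direction
uncs_asym = {
    "up": {"dist": 0.03, "unmatched": 0.06},
    "down": {"dist": 0.02, "unmatched": 0.06},
}

# combine in quadrature separately for the up and down shifts
tot_rel_unc_up = np.linalg.norm(list(uncs_sym.values()) + list(uncs_asym["up"].values()))
tot_rel_unc_down = np.linalg.norm(list(uncs_sym.values()) + list(uncs_asym["down"].values()))

print(f"LP SF = {lp_sf:.2f} +{lp_sf * tot_rel_unc_up:.2f} -{lp_sf * tot_rel_unc_down:.2f}")
```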