Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/rkansal47/HHbbVV
Browse files Browse the repository at this point in the history
  • Loading branch information
rkansal47 committed Apr 9, 2024
2 parents 8587564 + 0cf5171 commit 0fc0f67
Show file tree
Hide file tree
Showing 177 changed files with 1,895 additions and 118 deletions.
1 change: 1 addition & 0 deletions .codespell-whitelist.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
subjet
fpr
tpr
datas
6 changes: 5 additions & 1 deletion src/HHbbVV/postprocessing/CreateDatacard.py
Original file line number Diff line number Diff line change
Expand Up @@ -1163,7 +1163,11 @@ def get_systematics_abcd(channels, channels_dict, channels_summed, rates_dict):
vals = []
for channel in channels:
for _, key in enumerate(all_processes):
if key == qcd_data_key or skey not in channel_systs_dict[channel][key]:
if (
key == qcd_data_key
or key not in channel_systs_dict[channel]
or skey not in channel_systs_dict[channel][key]
):
vals.append("-")
else:
val, val_down = channel_systs_dict[channel][key][skey]
Expand Down
152 changes: 124 additions & 28 deletions src/HHbbVV/postprocessing/InferenceAnalysis.ipynb

Large diffs are not rendered by default.

201 changes: 142 additions & 59 deletions src/HHbbVV/postprocessing/TrainBDT.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,20 +61,20 @@
# "VVFatJetParTMD_THWW4q",
"VVFatJetParTMD_probHWW3q",
"VVFatJetParTMD_probQCD",
"vbf_Mass_jj",
"VVFatJetParTMD_probHWW4q",
"VVFatJetParticleNetMass",
"DijetMass",
"VVFatJetParTMD_probT",
"VVFatJetPtOverDijetPt",
"VVFatJetPt",
"DijetPt",
"VVFatJetPtOverDijetPt",
"MET_pt",
"bbFatJetPt",
"VVFatJetPt",
"VVFatJetPtOverbbFatJetPt",
"MET_pt",
"DijetdEta",
"DijetdPhi", # TODO: current dPhi is buggy
"vbf_Mass_jj",
"vbf_dEta_jj",
"DijetdPhi", # TODO: current dPhi is buggy
"DijetdEta",
]


Expand Down Expand Up @@ -286,6 +286,8 @@ def main(args):

if args.rem_feats:
bdtVars = bdtVars[: -args.rem_feats]
if len(args.rem_feats_name):
bdtVars = [var for var in bdtVars if var not in args.rem_feats_name]

print("BDT features:\n", bdtVars)

Expand Down Expand Up @@ -500,6 +502,128 @@ def _get_bdt_scores(preds, sig_keys, multiclass):
return preds[:, : len(sig_keys)] / (preds[:, : len(sig_keys)] + bg_tot)


def plot_train_test_shapes(
train, test, sig_keys, model_dir, test_size, training_keys, equalize_sig_bg
):
# BDT score shapes
bins = [[20, 0, 1], [20, 0.4, 1], [20, 0.8, 1], [20, 0.9, 1], [20, 0.98, 1]]

if len(sig_keys) == 1:
scores = [("BDTScore", "BDT Score")]
else:
scores = [
(f"BDTScore{sig_key}", f"BDT Score {plotting.sample_label_map[sig_key]}")
for sig_key in sig_keys
]

plot_vars = [utils.ShapeVar(*score, tbins) for score in scores for tbins in bins]

save_model_dir = model_dir / "hists"
save_model_dir.mkdir(exist_ok=True, parents=True)

for year in train:
for shape_var in plot_vars:
h = Hist(
hist.axis.StrCategory(["Train", "Test"], name="Data"),
hist.axis.StrCategory(training_keys, name="Sample"),
shape_var.axis,
storage="weight",
)

for dataset, label in [(train, "Train"), (test, "Test")]:
# Normalize the two distributions
data_sf = (0.5 / test_size) if label == "Test" else (0.5 / (1 - test_size))
for key in training_keys:
# scale signals back down to normal by ~equalizing scale factor
sf = data_sf / 1e6 if (key in sig_keys and equalize_sig_bg) else data_sf
data = dataset[year][dataset[year]["Dataset"] == key]
fill_data = {shape_var.var: data[shape_var.var]}
h.fill(Data=label, Sample=key, **fill_data, weight=data[weight_key] * sf)

plotting.ratioTestTrain(
h,
training_keys,
shape_var,
year,
save_model_dir,
name=f"{year}_{shape_var.var}_{shape_var.bins[1]}",
)

plotting.ratioTestTrain(
h,
[key for key in training_keys if key != "QCD"],
shape_var,
year,
save_model_dir,
name=f"{year}_{shape_var.var}_{shape_var.bins[1]}_noqcd",
)


# def plot_mass_shapes(model, bdtVars, multiclass, train, test, sig_keys, model_dir, training_keys):
def plot_mass_shapes(train, test, sig_keys, model_dir, training_keys):
cuts = [0, 0.1, 0.5, 0.9, 0.95]

shape_var = utils.ShapeVar(
var="bbFatJetParticleNetMass", label=r"$m^{bb}_{reg}$ (GeV)", bins=[20, 50, 250]
)

save_model_dir = model_dir / "mass_hists"
save_model_dir.mkdir(exist_ok=True, parents=True)

for data_dict, label in [(train, "train"), (test, "test")]:
for key in training_keys:
for sig_key in sig_keys:
fig, axs = plt.subplots(len(data_dict), 1, figsize=(12, 12 * len(data_dict)))
datas = []
for i, (year, data) in enumerate(data_dict.items()):
ed_key = {key: data[data["Dataset"] == key]}
datas.append(data[data["Dataset"] == key])

plotting.cutsLinePlot(
ed_key,
shape_var,
key,
f"BDTScore{sig_key}",
cuts,
year,
weight_key,
ax=axs[i] if len(data_dict) > 1 else axs,
)

plt.savefig(
save_model_dir / f"{label}_{key}_BDT{sig_key}Cuts.pdf", bbox_inches="tight"
)

if len(data_dict) > 1:
ed_key = {key: pd.concat(datas, axis=0)}
plotting.cutsLinePlot(
ed_key,
shape_var,
key,
f"BDTScore{sig_key}",
cuts,
"all",
weight_key,
plot_dir=save_model_dir,
name=f"{label}_{key}_BDT{sig_key}Cuts_AllYears",
show=False,
)

# make plots with artificially increased qcd statistics
# if key == qcd_key:
# nsampling = 100
# X = get_X(data_dict, bdtVars)
# X = X.iloc[np.tile(np.arange(len(X)), nsampling)]

# rng = np.random.default_rng(seed=42)
# X["VVFatJetParticleNetMass"] = rng.random(len(X)) * 15 + 125
# X["VVFatJetParTMD_probHWW3q"] = rng.random(len(X)) * 0.05 + 0.95
# X["VVFatJetParTMD_probQCD"] = rng.random(len(X)) * 0.025 + 0.025

# preds = model.predict_proba(get_X(data, bdtVars))
# preds = _get_bdt_scores(preds, sig_keys, multiclass)


def evaluate_model(
model: xgb.XGBClassifier,
model_dir: str,
Expand Down Expand Up @@ -626,58 +750,12 @@ def evaluate_model(
plotting.multiROCCurve(rocs, plot_dir=model_dir, name="roc_combined_thresholds")
plotting.multiROCCurve(rocs, thresholds=[], plot_dir=model_dir, name="roc_combined")

# BDT score shapes
bins = [[20, 0, 1], [20, 0.4, 1], [20, 0.8, 1], [20, 0.9, 1], [20, 0.98, 1]]

if len(sig_keys) == 1:
scores = [("BDTScore", "BDT Score")]
else:
scores = [
(f"BDTScore{sig_key}", f"BDT Score {plotting.sample_label_map[sig_key]}")
for sig_key in sig_keys
]

plot_vars = [utils.ShapeVar(*score, tbins) for score in scores for tbins in bins]

save_model_dir = model_dir / "hists"
save_model_dir.mkdir(exist_ok=True, parents=True)

for year in train:
for shape_var in plot_vars:
h = Hist(
hist.axis.StrCategory(["Train", "Test"], name="Data"),
hist.axis.StrCategory(training_keys, name="Sample"),
shape_var.axis,
storage="weight",
)

for dataset, label in [(train, "Train"), (test, "Test")]:
# Normalize the two distributions
data_sf = (0.5 / test_size) if label == "Test" else (0.5 / (1 - test_size))
for key in training_keys:
# scale signals back down to normal by ~equalizing scale factor
sf = data_sf / 1e6 if (key in sig_keys and equalize_sig_bg) else data_sf
data = dataset[year][dataset[year]["Dataset"] == key]
fill_data = {shape_var.var: data[shape_var.var]}
h.fill(Data=label, Sample=key, **fill_data, weight=data[weight_key] * sf)

plotting.ratioTestTrain(
h,
training_keys,
shape_var,
year,
save_model_dir,
name=f"{year}_{shape_var.var}_{shape_var.bins[1]}",
)

plotting.ratioTestTrain(
h,
[key for key in training_keys if key != "QCD"],
shape_var,
year,
save_model_dir,
name=f"{year}_{shape_var.var}_{shape_var.bins[1]}_noqcd",
)
plot_mass_shapes(train, test, sig_keys, model_dir, training_keys)
print("Made mass plots")
plot_train_test_shapes(
train, test, sig_keys, model_dir, test_size, training_keys, equalize_sig_bg
)
print("Made histograms")

# temporarily save train and test data as pickles to iterate on plots
with (model_dir / "train.pkl").open("wb") as f:
Expand Down Expand Up @@ -815,7 +893,12 @@ def do_inference(
"--n-estimators", default=10000, help="max number of trees to keep adding", type=int
)

parser.add_argument("--rem-feats", default=0, help="remove N lowest importance feats", type=int)
parser.add_argument(
"--rem-feats", default=0, help="remove N lowest importance training feats", type=int
)
parser.add_argument(
"--rem-feats-name", default=[], help="remove training features by name", type=str, nargs="*"
)

"""
Slightly worse to use a single tagger score
Expand Down
19 changes: 13 additions & 6 deletions src/HHbbVV/postprocessing/bash_scripts/BDTPlots.sh
Original file line number Diff line number Diff line change
@@ -1,22 +1,28 @@
#!/bin/bash
# shellcheck disable=SC2086,SC2043
# shellcheck disable=SC2086,SC2043,SC2206

####################################################################################################
# BDT Sculpting plots
# Author: Raghav Kansal
####################################################################################################

years=("2016APV" "2016" "2017" "2018")

MAIN_DIR="../../.."
data_dir="$MAIN_DIR/../data/skimmer/24Mar14UpdateData"
bdt_preds_dir="$data_dir/24_03_07_new_samples_max_depth_5/inferences"
bdt_preds_dir="$data_dir/24_04_03_k2v0_training_eqsig_vbf_vars/inferences"
TAG=""


options=$(getopt -o "" --long "tag:" -- "$@")
options=$(getopt -o "" --long "year:,tag:" -- "$@")
eval set -- "$options"

while true; do
case "$1" in
--year)
shift
years=($1)
;;
--tag)
shift
TAG=$1
Expand All @@ -41,9 +47,10 @@ if [[ -z $TAG ]]; then
exit 1
fi

for year in 2016APV 2016 2017 2018
for year in "${years[@]}"
do
python postprocessing.py --year $year --data-dir $data_dir --bdt-preds-dir $bdt_preds_dir --no-lp-sf-all-years \
--sig-samples GluGluToHHTobbVV_node_cHHH1 --bg-keys QCD TT "Z+Jets" \
echo $year
python postprocessing.py --year $year --data-dir $data_dir --bdt-preds-dir $bdt_preds_dir \
--sig-samples GluGluToHHTobbVV_node_cHHH1 qqHH_CV_1_C2V_0_kl_1_HHbbVV --bg-keys QCD TT "Z+Jets" --no-data \
--bdt-plots --plot-dir "$MAIN_DIR/plots/PostProcessing/$TAG"
done
4 changes: 2 additions & 2 deletions src/HHbbVV/postprocessing/bash_scripts/NonresTemplatesScan.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ years=("2016APV" "2016" "2017" "2018")

MAIN_DIR="../../.."
data_dir="$MAIN_DIR/../data/skimmer/24Mar14UpdateData"
bdt_preds_dir="$data_dir/24_04_03_k2v0_training_eqsig_vbf_vars/inferences"
bdt_preds_dir="$data_dir/24_04_05_k2v0_training_eqsig_vbf_vars_rm_deta/inferences"
# sig_samples="HHbbVV qqHH_CV_1_C2V_0_kl_1_HHbbVV"
sig_samples="qqHH_CV_1_C2V_0_kl_1_HHbbVV"
TAG=""
Expand All @@ -28,7 +28,7 @@ while true; do
;;
--bdt)
# bdt_cut="--nonres-bdt-wp 0.99 0.997 0.998 0.999 0.9997 0.9999"
bdt_cut="--nonres-bdt-wp 0.997 0.998 0.999 0.9997 0.9999"
bdt_cut="--nonres-bdt-wp 0.99 0.997 0.998 0.999 0.9997 0.9999"
;;
--txbb)
txbb_cut="--nonres-txbb-wp MP HP"
Expand Down
Loading

0 comments on commit 0fc0f67

Please sign in to comment.