Skip to content

Commit

Permalink
Merge pull request #4 from HEP-KBFI/example
Browse files Browse the repository at this point in the history
Example
  • Loading branch information
Laurits7 authored Apr 4, 2024
2 parents 5490ca8 + db01974 commit d6d274c
Show file tree
Hide file tree
Showing 16 changed files with 30,736 additions and 29,915 deletions.
19,992 changes: 5,997 additions & 13,995 deletions enreg/config/datasets/test.yaml

Large diffs are not rendered by default.

33,902 changes: 20,992 additions & 12,910 deletions enreg/config/datasets/train.yaml

Large diffs are not rendered by default.

5,991 changes: 3,003 additions & 2,988 deletions enreg/config/datasets/validation.yaml

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion enreg/config/ml_datasets.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# list_dir: $HOME/ml-tau-en-reg/enreg/config/datasets
list_dir: /home/laurits/tmp/datasets
list_dir: /home/laurits/ml-tau-en-reg/enreg/config/datasets
relative_sizes:
train: 0.7
test: 0.2
Expand Down
6 changes: 3 additions & 3 deletions enreg/config/model_training.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
verbosity: 1
output_dir: /home/laurits/ENREG/training/20240314_HEPMC_based_PT_tauID/
output_dir: /home/laurits/ENREG/training/20240403_PT_full_stats_ZH_plus_Z_regression_no_lifetme/
samples_to_use:
- ZH_Htautau
- QCD
# - Z_Ztautau # As there are no Z_Ztautau files in the training dataset
# - QCD
- Z_Ztautau # NOTE: now enabled; the earlier remark that the training dataset contains no Z_Ztautau files appears outdated — confirm
defaults:
- models: models
- datasets: datasets
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
feature_standardization:
standardize_inputs: False
method: "mean_rms"
path: /home/laurits/ml-tau-en-reg/enreg/data/ParticleTransformer_FeatureStandardization_wLifetime_2023Jun22.json
path: /home/laurits/ml-tau-en-reg/enreg/data/ParticleTransformer_FeatureStandardization.json
defaults:
- builder
- dataset
Expand Down
2 changes: 1 addition & 1 deletion enreg/config/models/ParticleTransformer/dataset.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
dataset:
max_cands: 25
use_pdgId: True
use_lifetime: True
use_lifetime: False
# metric: eta-phi
min_jet_theta: 0.0
max_jet_theta: 1000
Expand Down
6 changes: 3 additions & 3 deletions enreg/config/models/ParticleTransformer/training.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
training:
type: classificiation #regression # Or classification
type: regression # Or classification
model_file: ParticleTransformer_model.pt # sync with builder
max_num_files: -1
batch_size: 256
num_epochs: 500
batch_size: 512
num_epochs: 300
num_dataloader_workers: 6
classweight_sig: 1
classweight_bgr: 10
Expand Down
10 changes: 5 additions & 5 deletions enreg/config/ntupelizer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,21 +53,21 @@ branches:
- SiTracks_Refitted_1.covMatrix[21]
test_run: False
samples_to_process:
- Z_Ztautau
- ZH_Htautau
# - Z_Ztautau
# - ZH_Htautau
- QCD
samples:
Z_Ztautau:
input_dir: /local/joosep/clic_edm4hep/2024_03/p8_ee_Z_Ztautau_ecm380/
output_dir: /home/laurits/ENREG/ntuples/20240329_hepmc_genjets/Z_Ztautau
output_dir: /home/laurits/ENREG/ntuples/20240402_full_stats/Z_Ztautau
is_signal: True
ZH_Htautau:
input_dir: /local/joosep/clic_edm4hep/2024_03/p8_ee_ZH_Htautau_ecm380/
output_dir: /home/laurits/ENREG/ntuples/20240329_hepmc_genjets/ZH_Htautau
output_dir: /home/laurits/ENREG/ntuples/20240402_full_stats/ZH_Htautau
is_signal: True
QCD:
input_dir: /local/joosep/clic_edm4hep/2024_03/p8_ee_qq_ecm380/
output_dir: /home/laurits/ENREG/ntuples/20240329_hepmc_genjets/QCD
output_dir: /home/laurits/ENREG/ntuples/20240402_full_stats/QCD
is_signal: False
validation:
output_dir: $HOME/CLIC_ntuple_validation_v5
Expand Down
1 change: 0 additions & 1 deletion enreg/scripts/ntupelize_edm4hep.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from enreg.tools.data_management import ntupelizer_slurm_tools as nst



def save_record_to_file(data: dict, output_path: str) -> None:
print(f"Saving to precessed data to {output_path}")
ak.to_parquet(data, output_path)
Expand Down
90 changes: 90 additions & 0 deletions enreg/scripts/resubmit_ntupelizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import os
import glob
import time
import hydra
import shutil
import awkward as ak
from omegaconf import DictConfig
from enreg.tools.data_management import ntupelizer as nt


# NOTE(review): hard-coded, user-specific scratch directory. None of the
# functions visible in this file read this module-level name:
# prepare_resubmission() takes its own `tmp_dir` parameter and main() passes
# cfg.tmp_dir instead -- confirm whether this constant is still needed.
tmp_dir = "/home/laurits/tmp/EEEPRGGQTR"

def prepare_resubmission(tmp_dir):
    """Stage the executables of failed batch jobs for resubmission.

    Every non-empty file in ``<tmp_dir>/error_files`` is treated as a failed
    job. For each of them the matching ``execute<index>.sh`` is copied from
    ``<tmp_dir>/executables`` into a freshly emptied
    ``<tmp_dir>/executables/resubmit`` directory.

    Parameters
    ----------
    tmp_dir : str
        Directory containing ``error_files/``, ``executables/`` and the
        ``input_paths_<index>.txt`` / ``output_paths_<index>.txt`` lists.

    Returns
    -------
    input_paths_files : list of str
        Paths ``<tmp_dir>/input_paths_<index>.txt``, one per failed job.
    output_paths_files : list of str
        Paths ``<tmp_dir>/output_paths_<index>.txt`` in the same order.

    Raises
    ------
    FileNotFoundError
        If the executable belonging to a failed job is missing
        (raised by ``shutil.copy``).
    """
    error_prefix = "error"
    executables_dir = os.path.join(tmp_dir, "executables")
    resubmission_dir = os.path.join(executables_dir, "resubmit")
    # Start from an empty directory. On the very first run it does not exist
    # yet, and an unconditional rmtree would raise FileNotFoundError.
    if os.path.isdir(resubmission_dir):
        shutil.rmtree(resubmission_dir)
    os.makedirs(resubmission_dir, exist_ok=True)
    input_paths_files = []
    output_paths_files = []
    # glob returns entries in arbitrary order; sort for reproducible output.
    for path in sorted(glob.glob(os.path.join(tmp_dir, "error_files", "*"))):
        if os.path.getsize(path) == 0:
            # Empty error file: nothing to resubmit for this job.
            continue
        file_name = os.path.basename(path)
        # Assumes error files are named "error<index>" -- TODO confirm.
        # Only the leading prefix is removed: str.strip("error") would drop
        # any of the characters e/r/o from BOTH ends of the name.
        if file_name.startswith(error_prefix):
            index = file_name[len(error_prefix):]
        else:
            index = file_name
        print(path)
        executable_path = os.path.join(executables_dir, f"execute{index}.sh")
        new_executable_path = os.path.join(resubmission_dir, f"execute{index}.sh")
        input_paths_files.append(os.path.join(tmp_dir, f"input_paths_{index}.txt"))
        output_paths_files.append(os.path.join(tmp_dir, f"output_paths_{index}.txt"))
        shutil.copy(executable_path, new_executable_path)
    print(f"Jobs to resubmit: {len(input_paths_files)}")
    print(f"Run `bash enreg/scripts/submit_builder_batchJobs.sh {resubmission_dir}`")
    return input_paths_files, output_paths_files


def find_faulty_files(input_paths_files, output_paths_files):
    """Read the path lists of the failed jobs into two flat lists.

    Parameters
    ----------
    input_paths_files : list of str
        Text files that hold one input path per line.
    output_paths_files : list of str
        Text files that hold one output path per line; paired with
        ``input_paths_files`` by position.

    Returns
    -------
    input_file_list : list of str
        All lines of the input list files, newline characters removed.
    output_file_list : list of str
        All lines of the output list files, newline characters removed.
    """

    def _read_entries(list_file):
        # One entry per line; only newline characters are stripped.
        with open(list_file, "rt") as handle:
            return [entry.strip('\n') for entry in handle]

    collected_inputs = []
    collected_outputs = []
    for in_list, out_list in zip(input_paths_files, output_paths_files):
        collected_inputs.extend(_read_entries(in_list))
        collected_outputs.extend(_read_entries(out_list))
    return collected_inputs, collected_outputs


def save_record_to_file(data: dict, output_path: str) -> None:
    """Write ``data`` to ``output_path`` as a parquet file.

    Parameters
    ----------
    data : dict
        Record handed unchanged to ``ak.to_parquet``.
    output_path : str
        Destination path of the parquet file.
    """
    # Log message fixed: it previously read "Saving to precessed data to ...".
    print(f"Saving processed data to {output_path}")
    ak.to_parquet(data, output_path)


def process_single_file(
    input_path: str,
    output_path: str,
    cfg: DictConfig
):
    """Ntupelize one input file unless its output already exists.

    The sample name is the name of the directory that contains
    ``output_path``; it selects the entry in ``cfg.samples`` whose
    ``is_signal`` flag is passed on as ``remove_background``.

    Parameters
    ----------
    input_path : str
        File handed to ``nt.process_input_file``.
    output_path : str
        Parquet file to create; if it already exists nothing is done.
    cfg : DictConfig
        Configuration providing ``samples``, ``tree_path`` and ``branches``.
    """
    sample = os.path.basename(os.path.dirname(output_path))
    if os.path.exists(output_path):
        print("File already processed, skipping.")
        return

    started = time.time()
    is_signal_sample = cfg.samples[sample].is_signal
    processed = nt.process_input_file(
        input_path,
        cfg.tree_path,
        cfg.branches,
        remove_background=is_signal_sample,
    )
    save_record_to_file(processed, output_path)
    elapsed = time.time() - started
    print(f"Finished processing in {elapsed} s.")


@hydra.main(config_path="../config", config_name="ntupelizer", version_base=None)
def main(cfg: DictConfig) -> None:
    """Stage failed jobs for resubmission and reprocess their files locally.

    The failed jobs are looked up under ``cfg.tmp_dir``. Every
    (input, output) pair listed for them is processed in turn; an exception
    raised for one file is printed and counted, and the loop carries on with
    the next file.

    Parameters
    ----------
    cfg : DictConfig
        Configuration object; ``tmp_dir`` is read here and the whole object
        is forwarded to ``process_single_file``.
    """
    separator = "-------------------------------"
    input_lists, output_lists = prepare_resubmission(cfg.tmp_dir)
    sources, targets = find_faulty_files(input_lists, output_lists)
    n_targets = len(targets)
    failed_inputs = []
    for position, (source, target) in enumerate(zip(sources, targets)):
        print(f"{position}/{n_targets}")
        try:
            process_single_file(source, target, cfg)
        except Exception as error:
            # Deliberately broad: one bad file must not abort the whole run.
            failed_inputs.append(source)
            print(separator)
            print(error)
            print(f"Failed to process {source}")
            print(separator)
    print(f"Number bad files: {len(failed_inputs)}")


# Run only when executed as a script. main is wrapped by @hydra.main above,
# which is why it is called here without passing cfg explicitly.
if __name__ == "__main__":
    main()
6 changes: 6 additions & 0 deletions enreg/tools/data_management/lorentzNet_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
import math
import vector
import awkward as ak
from torch.utils.data import Dataset
from enreg.tool.models.LorentzNet import psi
from sklearn.preprocessing import OneHotEncoder
13 changes: 9 additions & 4 deletions enreg/tools/data_management/ntupelizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,7 @@ def retrieve_hepmc_gen_tau_info(hepmc_events, gen_jets):
taus = [p for p in event.particles if (abs(p.pid) == 15) and (p.status == 2)]
tau_children = [p.children for p in event.particles if (abs(p.pid) == 15) and (p.status == 2)]
tau_info["tau_full_p4s"].append([vector.awk(ak.zip({
"energy": [tau.momentum.e],
"mass": [tau.generated_mass],
"x": [tau.momentum.px],
"y": [tau.momentum.py],
"z": [tau.momentum.pz]}))[0] for tau in taus])
Expand Down Expand Up @@ -486,12 +486,17 @@ def load_events_from_hepmc(root_file_path: str):


def no_tau_genjet_matching(gen_jets):
filler = ak.zeros_like(gen_jets)
gen_tau_jet_info = {
"gen_jet_tau_vis_energy": ak.values_astype(ak.Array(ak.zeros_like(gen_jets) == ak.ones_like(gen_jets)), int),
"gen_jet_tau_decaymode": ak.values_astype(ak.Array(ak.ones_like(gen_jets) == ak.ones_like(gen_jets)), int) * -1,
"tau_gen_jet_charge": ak.values_astype(ak.Array(ak.ones_like(gen_jets) == ak.ones_like(gen_jets)), int) * -999,
"tau_gen_jet_p4s_full": ak.zeros_like(gen_jets),
"tau_gen_jet_p4s": ak.zeros_like(gen_jets),
"tau_gen_jet_p4s_full": vector.awk(
ak.zip({"mass": filler.mass, "px": filler.x, "py": filler.y, "pz": filler.z})
),
"tau_gen_jet_p4s": vector.awk(
ak.zip({"mass": filler.mass, "px": filler.x, "py": filler.y, "pz": filler.z})
),
"tau_gen_jet_DV_x": ak.values_astype(ak.Array(ak.zeros_like(gen_jets) == ak.ones_like(gen_jets)), int),
"tau_gen_jet_DV_y": ak.values_astype(ak.Array(ak.zeros_like(gen_jets) == ak.ones_like(gen_jets)), int),
"tau_gen_jet_DV_z": ak.values_astype(ak.Array(ak.zeros_like(gen_jets) == ak.ones_like(gen_jets)), int),
Expand All @@ -506,7 +511,7 @@ def retrieve_hepmc_gen_particles(hepmc_events):
event_stable_gen_particles = [p for p in event.particles if (p.status == 1) and (abs(p.pid) not in [12,14,16])]
stable_mc_particles.append([{"PDG": p.pid} for p in event_stable_gen_particles])
stable_mc_p4.append([vector.awk(ak.zip({
"energy": [gp.momentum.e],
"mass": [gp.generated_mass],
"x": [gp.momentum.px],
"y": [gp.momentum.py],
"z": [gp.momentum.pz]
Expand Down
6 changes: 3 additions & 3 deletions enreg/tools/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,11 +171,11 @@ def reinitialize_p4(p4_obj: ak.Array):
p4 : ak.Array
Particle with initialized 4-momenta.
"""
if "t" in p4_obj.fields:
if "tau" in p4_obj.fields:
p4 = vector.awk(
ak.zip(
{
"energy": p4_obj.t,
"mass": p4_obj.tau,
"x": p4_obj.x,
"y": p4_obj.y,
"z": p4_obj.z,
Expand All @@ -186,7 +186,7 @@ def reinitialize_p4(p4_obj: ak.Array):
p4 = vector.awk(
ak.zip(
{
"mass": p4_obj.tau,
"energy": p4_obj.t,
"x": p4_obj.x,
"y": p4_obj.y,
"z": p4_obj.z,
Expand Down
Loading

0 comments on commit d6d274c

Please sign in to comment.