From e6ef7828f1d7534a9a26f311f8cfa2299bdbfb3d Mon Sep 17 00:00:00 2001
From: dachengx
Date: Mon, 29 Apr 2024 16:01:30 -0500
Subject: [PATCH] Separate 2-hit and 3+ hit S1 pairing and add normalization factor

---
 axidence/plugins/pairing/events_paired.py |   2 +
 axidence/plugins/pairing/peaks_paired.py  | 190 ++++++++++++++--------
 2 files changed, 126 insertions(+), 66 deletions(-)

diff --git a/axidence/plugins/pairing/events_paired.py b/axidence/plugins/pairing/events_paired.py
index 8b18b34..f9727e0 100644
--- a/axidence/plugins/pairing/events_paired.py
+++ b/axidence/plugins/pairing/events_paired.py
@@ -116,6 +116,7 @@ def event_fields(self):
                 np.int8,
             ),
             (("Event number in this dataset", "event_number"), np.int64),
+            (("Normalization of number of paired events", "normalization"), np.float32),
         ]
         return dtype

@@ -147,6 +148,7 @@ def compute(self, events_paired, peaks_paired):
                     "Maybe the paired events overlap."
                 )
             result["event_number"][i] = sp["event_number"][0]
+            result["normalization"][i] = sp["normalization"][0]
         for idx, main_peak in zip([event["s1_index"], event["s2_index"]], ["s1_", "s2_"]):
             if idx >= 0:
                 for n in self.peak_fields:
diff --git a/axidence/plugins/pairing/peaks_paired.py b/axidence/plugins/pairing/peaks_paired.py
index 9ebdda2..d0bdd32 100644
--- a/axidence/plugins/pairing/peaks_paired.py
+++ b/axidence/plugins/pairing/peaks_paired.py
@@ -68,8 +68,15 @@ class PeaksPaired(ExhaustPlugin, DownChunkingPlugin):
     # multiple factor is 100, then we will make 100 AC events
     paring_rate_bootstrap_factor = straxen.URLConfig(
         default=1e2,
-        type=(int, float),
-        help="Bootstrap factor for AC rate",
+        type=(int, float, list, tuple),
+        help=(
+            "Bootstrap factor for AC rate; "
+            "if a list or tuple is given, the entries are the factors for 2-hit and 3+ hit S1"
+        ),
+    )
+
+    s1_min_coincidence = straxen.URLConfig(
+        default=2, type=int, help="Minimum tight coincidence necessary to make an S1"
     )

     apply_shadow_matching = straxen.URLConfig(
@@ -115,6 +122,7 @@ def infer_dtype(self):
             (("Original type of group", "origin_group_type"), np.int8),
             (("Original s1_index in isolated S1", "origin_s1_index"), np.int32),
             (("Original s2_index in isolated S2", "origin_s2_index"), np.int32),
+            (("Normalization of number of paired events", "normalization"), np.float32),
         ]
         truth_dtype = [
             (("Event number in this dataset", "event_number"), np.int64),
@@ -126,6 +134,7 @@ def infer_dtype(self):
             ),
             (("Original isolated S1 group", "s1_group_number"), np.int32),
             (("Original isolated S2 group", "s2_group_number"), np.int32),
+            (("Normalization of number of paired events", "normalization"), np.float32),
         ] + strax.time_fields

         return dict(peaks_paired=peaks_dtype, truth_paired=truth_dtype)
@@ -138,6 +147,17 @@ def setup(self, prepare=True):
         self.rng = np.random.default_rng(seed=self.pairing_seed)
         self.time_left = self.paring_time_interval // 2
         self.time_right = self.paring_time_interval - self.time_left
+        if self.s1_min_coincidence != 2:
+            raise NotImplementedError("Only s1_min_coincidence = 2 is supported for now!")
+        if isinstance(self.paring_rate_bootstrap_factor, (list, tuple)):
+            if len(self.paring_rate_bootstrap_factor) != 2:
+                raise ValueError(
+                    "The length of paring_rate_bootstrap_factor should be 2 "
+                    "if a list or tuple is provided!"
+                )
+            self.bootstrap_factor = list(self.paring_rate_bootstrap_factor)
+        else:
+            self.bootstrap_factor = [self.paring_rate_bootstrap_factor] * 2

     @staticmethod
     def preprocess_isolated_s2(s2):
@@ -171,14 +191,18 @@ def simple_pairing(
             s1_rate
             * s2_rate
             * (max_drift_time - min_drift_time)
             / units.s
             / paring_rate_correction
         )
         n_events = round(paring_rate_full * run_time * paring_rate_bootstrap_factor)
-        s1_group_number = rng.choice(len(s1), size=n_events, replace=True)
-        s2_group_number = rng.choice(len(s2), size=n_events, replace=True)
+        s1_group_index = rng.choice(len(s1), size=n_events, replace=True)
+        s2_group_index = rng.choice(len(s2), size=n_events, replace=True)
         if fixed_drift_time is None:
             drift_time = rng.uniform(min_drift_time, max_drift_time, size=n_events)
         else:
             warnings.warn(f"Using fixed drift time {fixed_drift_time}ns")
             drift_time = np.full(n_events, fixed_drift_time)
-        return paring_rate_full, s1_group_number, s2_group_number, drift_time
+        return paring_rate_full, (
+            s1["group_number"][s1_group_index],
+            s2["group_number"][s2_group_index],
+            drift_time,
+        )

     def shadow_reference_selection(self, peaks_salted):
         return peaks_salted[peaks_salted["type"] == 2]
@@ -210,6 +234,7 @@ def preprocess_shadow(data, shadow_deltatime_exponent, delta_t=0, prefix=""):
         x = np.log10(pre_s2_area * dt_s2_time_shadow**shadow_deltatime_exponent)
         y = np.sqrt(np.log10(pre_s2_area) ** 2 + np.log10(dt_s2_time_shadow) ** 2)
         sample = np.stack([x, y]).T
+        # sample = np.stack([np.log10(dt_s2_time_shadow), np.log10(pre_s2_area)]).T
         return sample

     @staticmethod
@@ -227,6 +252,7 @@ def shadow_matching(
         paring_rate_correction,
         paring_rate_bootstrap_factor,
         rng,
+        preprocess_shadow,
         onlyrate=False,
     ):
         # perform Shadow matching technique
@@ -235,10 +261,8 @@
         # 2D equal binning
         # prepare the 2D space, x is log(S2/dt), y is (log(S2)**2+log(dt)**2)**0.5
         # because these 2 dimension is orthogonal
-        sampled_correlation = PeaksPaired.preprocess_shadow(
-            shadow_reference, shadow_deltatime_exponent
-        )
-        s1_sample = PeaksPaired.preprocess_shadow(s1, shadow_deltatime_exponent)
+        sampled_correlation = preprocess_shadow(shadow_reference, shadow_deltatime_exponent)
+        s1_sample = preprocess_shadow(s1, shadow_deltatime_exponent)

         # use (x, y) distribution of isolated S1 as reference
         # because it is more intense when shadow(S2/dt) is large
@@ -285,16 +309,16 @@
         if not onlyrate:
             # get indices of the 2D bins
             s1_digit = PeaksPaired.digitize2d(s1_sample, bin_edges, n_shadow_bins)
-            _s1_group_number = np.arange(len(s1))
-            s1_group_number_list = [
-                _s1_group_number[s1_digit == xd].tolist()
+            _s1_group_index = np.arange(len(s1))
+            s1_group_index_list = [
+                _s1_group_index[s1_digit == xd].tolist()
                 for xd in range(n_shadow_bins * n_shadow_bins)
             ]

         drift_time_bins = np.linspace(min_drift_time, max_drift_time, n_drift_time_bins + 1)
         drift_time_bin_center = (drift_time_bins[:-1] + drift_time_bins[1:]) / 2

-        group_number_list = []
+        group_index_list = []
         _paring_rate_full = np.zeros(len(drift_time_bin_center))
         for i in range(len(drift_time_bin_center)):
             if shift_dt_shadow_matching:
@@ -303,9 +327,7 @@
                 delta_t = drift_time_bin_center[i]
             else:
                 delta_t = 0
-            data_sample = PeaksPaired.preprocess_shadow(
-                s2, shadow_deltatime_exponent, delta_t=delta_t
-            )
+            data_sample = preprocess_shadow(s2, shadow_deltatime_exponent, delta_t=delta_t)
             ge.check_sample_sanity(data_sample)
             # apply binning to (x, y)
             s2_shadow_count = ge.apply_irregular_binning(
@@ -331,43 +353,48 @@
                if count_pairing.max() == 0:
                    count_pairing[mu_shadow.argmax()] = 1
                s2_digit = PeaksPaired.digitize2d(data_sample, bin_edges, n_shadow_bins)
-                _s2_group_number = np.arange(len(s2))
-                s2_group_number_list = [
-                    _s2_group_number[s2_digit == xd].tolist()
+                _s2_group_index = np.arange(len(s2))
+                s2_group_index_list = [
+                    _s2_group_index[s2_digit == xd].tolist()
                     for xd in range(n_shadow_bins * n_shadow_bins)
                 ]
                 # random sample isolated S1 and S2's group number
-                _s1_group_number = np.hstack(
+                _s1_group_index = np.hstack(
                     [
                         rng.choice(
-                            s1_group_number_list[xd],
+                            s1_group_index_list[xd],
                             size=count_pairing[xd],
                         )
                         for xd in range(n_shadow_bins * n_shadow_bins)
                     ]
                 )
-                _s2_group_number = np.hstack(
+                _s2_group_index = np.hstack(
                     [
                         rng.choice(
-                            s2_group_number_list[xd],
+                            s2_group_index_list[xd],
                             size=count_pairing[xd],
                         )
                         for xd in range(n_shadow_bins * n_shadow_bins)
                     ]
                 )
                 # sample drift time in this bin
-                _drift_time = rng.choice(
-                    round(drift_time_bins[i + 1] - drift_time_bins[i]),
+                _drift_time = rng.uniform(
+                    drift_time_bins[i],
+                    drift_time_bins[i + 1],
                     size=count_pairing.sum(),
-                ) + round(drift_time_bins[i])
-                group_number_list.append([_s1_group_number, _s2_group_number, _drift_time])
+                )
+                group_index_list.append([_s1_group_index, _s2_group_index, _drift_time])

         paring_rate_full = _paring_rate_full.sum()
         if not onlyrate:
-            s1_group_number = np.hstack([group[0] for group in group_number_list]).astype(int)
-            s2_group_number = np.hstack([group[1] for group in group_number_list]).astype(int)
-            drift_time = np.hstack([group[2] for group in group_number_list]).astype(int)
-            assert len(s1_group_number) == len(s2_group_number)
-            return paring_rate_full, s1_group_number, s2_group_number, drift_time
+            s1_group_index = np.hstack([group[0] for group in group_index_list]).astype(int)
+            s2_group_index = np.hstack([group[1] for group in group_index_list]).astype(int)
+            drift_time = np.hstack([group[2] for group in group_index_list]).astype(int)
+            assert len(s1_group_index) == len(s2_group_index)
+            return paring_rate_full, (
+                s1["group_number"][s1_group_index],
+                s2["group_number"][s2_group_index],
+                drift_time,
+            )

     def split_chunks(self, n_peaks):
         # divide results into chunks
@@ -416,7 +443,7 @@ def build_arrays(
             # isolated S1 is assigned peak by peak
             s1_index = s1_group_number[i]
             for q in self.dtype["peaks_paired"].names:
-                if "origin" not in q and q not in ["event_number"]:
+                if "origin" not in q and q not in ["event_number", "normalization"]:
                     _array[0][q] = s1[s1_index][q]
             # _array[0]["origin_run_id"] = s1["run_id"][s1_index]
             _array[0]["origin_group_number"] = s1["group_number"][s1_index]
@@ -434,7 +461,7 @@
             group_number = s2_group_number[i]
             s2_group_i = s2[s2_group_index[group_number] : s2_group_index[group_number + 1]]
             for q in self.dtype["peaks_paired"].names:
-                if "origin" not in q and q not in ["event_number"]:
+                if "origin" not in q and q not in ["event_number", "normalization"]:
                     _array[1:][q] = s2_group_i[q]
             s2_index = s2_group_i["s2_index"]
             # _array[1:]["origin_run_id"] = s2_group_i["run_id"]
@@ -524,38 +551,64 @@ def compute(self, isolated_s1, isolated_s2, peaks_salted, start, end):
         print(f"S1 rate is {s1_rate:.3f}Hz")
         print(f"There are {len(main_isolated_s2)} S2 peaks group")
         print(f"S2 rate is {s2_rate * 1e3:.3f}mHz")
-        if self.apply_shadow_matching:
-            # simulate AC's drift time bin by bin
-            shadow_reference = self.shadow_reference_selection(peaks_salted)
-            paring_rate_full, s1_group_number, s2_group_number, drift_time = self.shadow_matching(
-                isolated_s1,
-                main_isolated_s2,
-                shadow_reference,
-                self.shadow_deltatime_exponent,
-                self.max_n_shadow_bins,
-                run_time,
-                self.max_drift_time,
-                self.min_drift_time,
-                self.n_drift_time_bins,
-                self.shift_dt_shadow_matching,
-                paring_rate_correction,
-                self.paring_rate_bootstrap_factor,
-                self.rng,
-            )
-        else:
-            paring_rate_full, s1_group_number, s2_group_number, drift_time = self.simple_pairing(
-                isolated_s1,
-                main_isolated_s2,
-                s1_rate,
-                s2_rate,
-                run_time,
-                self.max_drift_time,
-                self.min_drift_time,
-                paring_rate_correction,
-                self.paring_rate_bootstrap_factor,
-                self.fixed_drift_time,
-                self.rng,
-            )
+        n_hits_2 = isolated_s1["n_hits"] == 2
+        n_hits_masks = [n_hits_2, ~n_hits_2]
+        truths = []
+        for i, mask in enumerate(n_hits_masks):
+            if mask.sum() != 0:
+                if self.apply_shadow_matching:
+                    # simulate AC's drift time bin by bin
+                    shadow_reference = self.shadow_reference_selection(peaks_salted)
+                    truth = self.shadow_matching(
+                        isolated_s1[mask],
+                        main_isolated_s2,
+                        shadow_reference,
+                        self.shadow_deltatime_exponent,
+                        self.max_n_shadow_bins,
+                        run_time,
+                        self.max_drift_time,
+                        self.min_drift_time,
+                        self.n_drift_time_bins,
+                        self.shift_dt_shadow_matching,
+                        paring_rate_correction,
+                        self.bootstrap_factor[i],
+                        self.rng,
+                        self.preprocess_shadow,
+                    )
+                else:
+                    truth = self.simple_pairing(
+                        isolated_s1[mask],
+                        main_isolated_s2,
+                        s1_rate,
+                        s2_rate,
+                        run_time,
+                        self.max_drift_time,
+                        self.min_drift_time,
+                        paring_rate_correction,
+                        self.bootstrap_factor[i],
+                        self.fixed_drift_time,
+                        self.rng,
+                    )
+            else:
+                truth = (
+                    0.0,
+                    (
+                        np.empty(0, dtype=isolated_s1["group_number"].dtype),
+                        np.empty(0, dtype=main_isolated_s2["group_number"].dtype),
+                        [],
+                    ),
+                )
+            truths.append(truth)
+        paring_rate_full = truths[0][0] + truths[1][0]
+        s1_group_number = np.hstack([truths[0][1][0], truths[1][1][0]])
+        s2_group_number = np.hstack([truths[0][1][1], truths[1][1][1]])
+        drift_time = np.hstack([truths[0][1][2], truths[1][1][2]])
+        normalization = np.hstack(
+            [
+                np.full(len(truths[0][1][0]), 1 / self.bootstrap_factor[0]),
+                np.full(len(truths[1][1][0]), 1 / self.bootstrap_factor[1]),
+            ]
+        )

         print(f"AC pairing rate is {paring_rate_full * 1e3:.3f}mHz")
         print(f"AC event number is {len(drift_time)}")
@@ -588,6 +641,11 @@ def compute(self, isolated_s1, isolated_s2, peaks_salted, start, end):
             )
             peaks_arrays["event_number"] += left_i
             truth_arrays["event_number"] += left_i
+            peaks_arrays["normalization"] = np.repeat(
+                normalization[left_i:right_i],
+                n_peaks[left_i:right_i],
+            )
+            truth_arrays["normalization"] = normalization[left_i:right_i]

             result = dict()
             result["peaks_paired"] = self.chunk(
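
The bookkeeping this patch introduces is easiest to check outside of strax: each paired event is oversampled by the bootstrap factor of its S1 population (2-hit or 3+ hit), and the new "normalization" field stores 1/factor per event, so a weighted event count recovers the physical AC rate even when the two factors differ. A minimal standalone sketch of that weighting (the live time, rates, and factor values below are made-up illustration, not values from the patch):

import numpy as np

run_time = 3600.0                  # live time in seconds (illustrative)
rate_2hit = 2e-3                   # physical AC rate of 2-hit S1 pairs, Hz (illustrative)
rate_3hit = 5e-4                   # physical AC rate of 3+ hit S1 pairs, Hz (illustrative)
bootstrap_factor = [100.0, 400.0]  # mirrors paring_rate_bootstrap_factor given as a list

# oversample each population by its own factor, as compute() does
n_2hit = round(rate_2hit * run_time * bootstrap_factor[0])
n_3hit = round(rate_3hit * run_time * bootstrap_factor[1])

# each simulated event carries weight 1 / factor, i.e. the "normalization" field
normalization = np.hstack(
    [
        np.full(n_2hit, 1 / bootstrap_factor[0]),
        np.full(n_3hit, 1 / bootstrap_factor[1]),
    ]
)

# the weighted event count divided by live time recovers the physical AC rate
recovered_rate = normalization.sum() / run_time
assert np.isclose(recovered_rate, rate_2hit + rate_3hit)

The last hunk broadcasts the same weights from events to their peaks via np.repeat(normalization[left_i:right_i], n_peaks[left_i:right_i]), so peak-level and event-level distributions stay consistently normalized.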