diff --git a/docs/changes/2628.feature.rst b/docs/changes/2628.feature.rst new file mode 100644 index 00000000000..f57b32854af --- /dev/null +++ b/docs/changes/2628.feature.rst @@ -0,0 +1 @@ +Add a generic stats-calculation tool utilizing the PixelStatisticsCalculator. diff --git a/docs/user-guide/tools.rst b/docs/user-guide/tools.rst index 619b08bf793..1a0b2320d9b 100644 --- a/docs/user-guide/tools.rst +++ b/docs/user-guide/tools.rst @@ -17,6 +17,7 @@ Data Processing Tools * ``ctapipe-quickstart``: create some default analysis configurations and a working directory * ``ctapipe-process``: Process event data in any supported format from R0/R1/DL0 to DL1 or DL2 HDF5 files. * ``ctapipe-apply-models``: Tool to apply machine learning models in bulk (as opposed to event by event). +* ``ctapipe-calculate-pixel-statistics``: Tool to aggregate statistics and detect outliers from pixel-wise image data. * ``ctapipe-train-disp-reconstructor`` : Train the ML models for the `ctapipe.reco.DispReconstructor` (monoscopic reconstruction) * ``ctapipe-train-energy-regressor``: Train the ML models for the `ctapipe.reco.EnergyRegressor` (energy estimation) * ``ctapipe-train-particle-classifier``: Train the ML models for the `ctapipe.reco.ParticleClassifier` (gamma-hadron separation) diff --git a/pyproject.toml b/pyproject.toml index 27c6cf0b7fc..fc0fbbc4bab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -107,6 +107,7 @@ ctapipe-process = "ctapipe.tools.process:main" ctapipe-merge = "ctapipe.tools.merge:main" ctapipe-fileinfo = "ctapipe.tools.fileinfo:main" ctapipe-quickstart = "ctapipe.tools.quickstart:main" +ctapipe-calculate-pixel-statistics = "ctapipe.tools.calculate_pixel_stats:main" ctapipe-train-energy-regressor = "ctapipe.tools.train_energy_regressor:main" ctapipe-train-particle-classifier = "ctapipe.tools.train_particle_classifier:main" ctapipe-train-disp-reconstructor = "ctapipe.tools.train_disp_reconstructor:main" diff --git a/src/ctapipe/resources/calculate_pixel_stats.yaml b/src/ctapipe/resources/calculate_pixel_stats.yaml new file mode 100644 index 00000000000..48e262d3ab2 --- /dev/null +++ b/src/ctapipe/resources/calculate_pixel_stats.yaml @@ -0,0 +1,32 @@ +StatisticsCalculatorTool: + allowed_tels: [1,2,3,4] + input_column_name: image + output_column_name: statistics + +PixelStatisticsCalculator: + stats_aggregator_type: + - ["type", "LST*", "SigmaClippingAggregator"] + - ["type", "MST*", "PlainAggregator"] + + chunk_shift: 1000 + faulty_pixels_fraction: 0.1 + outlier_detector_list: + - name: MedianOutlierDetector + apply_to: median + config: + median_range_factors: [-15, 15] + - name: RangeOutlierDetector + apply_to: median + config: + validity_range: [-20, 120] + - name: StdOutlierDetector + apply_to: std + config: + std_range_factors: [-15, 15] + +SigmaClippingAggregator: + chunk_size: 2500 + max_sigma: 4 + iterations: 5 +PlainAggregator: + chunk_size: 2500 diff --git a/src/ctapipe/tools/calculate_pixel_stats.py b/src/ctapipe/tools/calculate_pixel_stats.py new file mode 100644 index 00000000000..0b9de78d6e0 --- /dev/null +++ b/src/ctapipe/tools/calculate_pixel_stats.py @@ -0,0 +1,194 @@ +""" +Perform statistics calculation from pixel-wise image data +""" + +import pathlib + +import numpy as np +from astropy.table import vstack + +from ctapipe.core import Tool +from ctapipe.core.tool import ToolConfigurationError +from ctapipe.core.traits import ( + Bool, + CInt, + Path, + Set, + Unicode, + classes_with_traits, +) +from ctapipe.instrument import SubarrayDescription +from ctapipe.io import write_table +from ctapipe.io.tableloader import TableLoader +from ctapipe.monitoring.calculator import PixelStatisticsCalculator + + +class StatisticsCalculatorTool(Tool): + """ + Perform statistics calculation for pixel-wise image data + """ + + name = "StatisticsCalculatorTool" + description = "Perform statistics calculation for pixel-wise image data" + + examples = """ + To calculate statistics of pixel-wise image data files: + + > ctapipe-calculate-pixel-statistics --TableLoader.input_url input.dl1.h5 --output_path /path/monitoring.h5 --overwrite + + """ + + allowed_tels = Set( + trait=CInt(), + default_value=None, + allow_none=True, + help=( + "List of allowed tel_ids, others will be ignored. " + "If None, all telescopes in the input stream will be included." + ), + ).tag(config=True) + + input_column_name = Unicode( + default_value="image", + allow_none=False, + help="Column name of the pixel-wise image data to calculate statistics", + ).tag(config=True) + + output_table_name = Unicode( + default_value="statistics", + allow_none=False, + help="Table name of the output statistics", + ).tag(config=True) + + output_path = Path( + help="Output filename", default_value=pathlib.Path("monitoring.h5") + ).tag(config=True) + + overwrite = Bool(help="Overwrite output file if it exists").tag(config=True) + + aliases = { + ("i", "input_url"): "TableLoader.input_url", + ("o", "output_path"): "StatisticsCalculatorTool.output_path", + } + + flags = { + "overwrite": ( + {"StatisticsCalculatorTool": {"overwrite": True}}, + "Overwrite existing files", + ), + } + + classes = [ + TableLoader, + ] + classes_with_traits(PixelStatisticsCalculator) + + def setup(self): + # Read the input data with the 'TableLoader' + self.input_data = TableLoader( + parent=self, + ) + # Check that the input and output files are not the same + if self.input_data.input_url == self.output_path: + raise ToolConfigurationError( + "Input and output files are same. Fix your configuration / cli arguments." + ) + # Load the subarray description from the input file + subarray = SubarrayDescription.from_hdf(self.input_data.input_url) + # Get the telescope ids from the input data or use the allowed_tels configuration + self.tel_ids = ( + subarray.tel_ids if self.allowed_tels is None else self.allowed_tels + ) + # Initialization of the statistics calculator + self.stats_calculator = PixelStatisticsCalculator( + parent=self, subarray=subarray + ) + + def start(self): + # Iterate over the telescope ids and calculate the statistics + for tel_id in self.tel_ids: + # Read the whole dl1 images for one particular telescope + dl1_table = self.input_data.read_telescope_events_by_id( + telescopes=[ + tel_id, + ], + dl1_images=True, + dl1_parameters=False, + dl1_muons=False, + dl2=False, + simulated=False, + true_images=False, + true_parameters=False, + instrument=False, + pointing=False, + )[tel_id] + # Check if the chunk size does not exceed the table length of the input data + if self.stats_calculator.stats_aggregators[ + self.stats_calculator.stats_aggregator_type.tel[tel_id] + ].chunk_size > len(dl1_table): + raise ToolConfigurationError( + f"Change --StatisticsAggregator.chunk_size to decrease the chunk size " + f"of the aggregation to a maximum of '{len(dl1_table)}' (table length of the " + f"input data for telescope 'tel_id={tel_id}')." + ) + # Check if the input column name is in the table + if self.input_column_name not in dl1_table.colnames: + raise ToolConfigurationError( + f"Column '{self.input_column_name}' not found " + f"in the input data for telescope 'tel_id={tel_id}'." + ) + # Perform the first pass of the statistics calculation + aggregated_stats = self.stats_calculator.first_pass( + table=dl1_table, + tel_id=tel_id, + col_name=self.input_column_name, + ) + # Check if 'chunk_shift' is selected + if self.stats_calculator.chunk_shift is not None: + # Check if there are any faulty chunks to perform a second pass over the data + if np.any(~aggregated_stats["is_valid"].data): + # Perform the second pass of the statistics calculation + aggregated_stats_secondpass = self.stats_calculator.second_pass( + table=dl1_table, + valid_chunks=aggregated_stats["is_valid"].data, + tel_id=tel_id, + col_name=self.input_column_name, + ) + # Stack the statistic values from the first and second pass + aggregated_stats = vstack( + [aggregated_stats, aggregated_stats_secondpass] + ) + # Sort the stacked aggregated statistic values by starting time + aggregated_stats.sort(["time_start"]) + else: + self.log.info( + "No faulty chunks found for telescope 'tel_id=%d'. Skipping second pass.", + tel_id, + ) + # Add metadata to the aggregated statistics + aggregated_stats.meta["event_type"] = dl1_table["event_type"][0] + aggregated_stats.meta["input_column_name"] = self.input_column_name + # Write the aggregated statistics and their outlier mask to the output file + write_table( + aggregated_stats, + self.output_path, + f"/dl1/monitoring/telescope/{self.output_table_name}/tel_{tel_id:03d}", + overwrite=self.overwrite, + ) + + def finish(self): + self.log.info( + "DL1 monitoring data was stored in '%s' under '%s'", + self.output_path, + f"/dl1/monitoring/telescope/{self.output_table_name}", + ) + self.log.info("Tool is shutting down") + + +def main(): + # Run the tool + tool = StatisticsCalculatorTool() + tool.run() + + +if __name__ == "main": + main() diff --git a/src/ctapipe/tools/quickstart.py b/src/ctapipe/tools/quickstart.py index 67410458d95..f8dfaff0d3c 100644 --- a/src/ctapipe/tools/quickstart.py +++ b/src/ctapipe/tools/quickstart.py @@ -12,6 +12,7 @@ CONFIGS_TO_WRITE = [ "base_config.yaml", + "calculate_pixel_stats.yaml", "stage1_config.yaml", "stage2_config.yaml", "ml_preprocessing_config.yaml", diff --git a/src/ctapipe/tools/tests/test_calculate_pixel_stats.py b/src/ctapipe/tools/tests/test_calculate_pixel_stats.py new file mode 100644 index 00000000000..bcd3af2a66d --- /dev/null +++ b/src/ctapipe/tools/tests/test_calculate_pixel_stats.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +""" +Test ctapipe-calculate-pixel-statistics tool +""" + +import pytest +from traitlets.config.loader import Config + +from ctapipe.core import run_tool +from ctapipe.core.tool import ToolConfigurationError +from ctapipe.io import read_table +from ctapipe.tools.calculate_pixel_stats import StatisticsCalculatorTool + + +def test_calculate_pixel_stats_tool(tmp_path, dl1_image_file): + """check statistics calculation from pixel-wise image data files""" + + # Create a configuration suitable for the test + tel_id = 3 + config = Config( + { + "StatisticsCalculatorTool": { + "allowed_tels": [tel_id], + "input_column_name": "image", + "output_table_name": "statistics", + }, + "PixelStatisticsCalculator": { + "stats_aggregator_type": [ + ("id", tel_id, "PlainAggregator"), + ], + }, + "PlainAggregator": { + "chunk_size": 1, + }, + } + ) + # Set the output file path + monitoring_file = tmp_path / "monitoring.dl1.h5" + # Run the tool with the configuration and the input file + run_tool( + StatisticsCalculatorTool(config=config), + argv=[ + f"--input_url={dl1_image_file}", + f"--output_path={monitoring_file}", + "--overwrite", + ], + cwd=tmp_path, + raises=True, + ) + # Check that the output file has been created + assert monitoring_file.exists() + # Check that the output file is not empty + assert ( + read_table( + monitoring_file, + path=f"/dl1/monitoring/telescope/statistics/tel_{tel_id:03d}", + )["mean"] + is not None + ) + + +def test_tool_config_error(tmp_path, dl1_image_file): + """check tool configuration error""" + + # Run the tool with the configuration and the input file + config = Config( + { + "StatisticsCalculatorTool": { + "allowed_tels": [3], + "input_column_name": "image_charges", + "output_table_name": "statistics", + } + } + ) + # Set the output file path + monitoring_file = tmp_path / "monitoring.dl1.h5" + # Check if ToolConfigurationError is raised + # when the column name of the pixel-wise image data is not correct + with pytest.raises( + ToolConfigurationError, match="Column 'image_charges' not found" + ): + run_tool( + StatisticsCalculatorTool(config=config), + argv=[ + f"--input_url={dl1_image_file}", + f"--output_path={monitoring_file}", + "--StatisticsAggregator.chunk_size=1", + "--overwrite", + ], + cwd=tmp_path, + raises=True, + ) + # Check if ToolConfigurationError is raised + # when the input and output files are the same + with pytest.raises( + ToolConfigurationError, match="Input and output files are same." + ): + run_tool( + StatisticsCalculatorTool(), + argv=[ + f"--input_url={dl1_image_file}", + f"--output_path={dl1_image_file}", + "--overwrite", + ], + cwd=tmp_path, + raises=True, + ) + # Check if ToolConfigurationError is raised + # when the chunk size is larger than the number of events in the input file + with pytest.raises( + ToolConfigurationError, match="Change --StatisticsAggregator.chunk_size" + ): + run_tool( + StatisticsCalculatorTool(), + argv=[ + f"--input_url={dl1_image_file}", + f"--output_path={monitoring_file}", + "--StatisticsCalculatorTool.allowed_tels=3", + "--StatisticsAggregator.chunk_size=2500", + "--overwrite", + ], + cwd=tmp_path, + raises=True, + )