From 9961a22c09078d044ba7872843f27a00e2144990 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?= Date: Fri, 29 Sep 2023 17:09:52 +0200 Subject: [PATCH 01/22] basic iterative and not-quite tree reduction hadd functions started --- src/proteus/operations/add_histograms.py | 115 +++++++++++++++ src/proteus/operations/parquet_to_root.py | 20 +++ src/proteus/to_feather.py | 65 +++++++++ src/proteus/to_parquet.py | 167 ++++++++++++++++++++++ 4 files changed, 367 insertions(+) create mode 100644 src/proteus/operations/add_histograms.py create mode 100644 src/proteus/operations/parquet_to_root.py create mode 100644 src/proteus/to_feather.py create mode 100644 src/proteus/to_parquet.py diff --git a/src/proteus/operations/add_histograms.py b/src/proteus/operations/add_histograms.py new file mode 100644 index 0000000..163550c --- /dev/null +++ b/src/proteus/operations/add_histograms.py @@ -0,0 +1,115 @@ +import uproot + +# Only combines one histogram per file +def hadd_like(files, destination, *, hist_name=None, hist_paths=None): + if hist_name == None: + # ? search through them nicely? Assume there are multiple? + array = hist.classnames() + try: + hist = uproot.open(files[0])[hist_name] + except: + # error: name not the same! + error = 5 + + # Base case + bins = hist.member('fN') + values = hist.values(flow=True) + fEntries = hist.member("fEntries") + fTsumw = hist.member("fTsumw") + if hist.member("fTsumw2") != None: + fTsumw2 = hist.member("fTsumw2") + else: + fTsumw2 = 0 + fTsumwx = hist.member("fTsumwx") + fTsumwx2 = hist.member("fTsumwx2") + variances = hist.variances("flow=True") + + # Iteratively / Sequentially: + for path in files[1:]: + with uproot.open(path) as file: + hist = file[hist_name] # histogram = uproot.open("file.root:path/to/histogram") + if bins != hist.member('fN'): + raise ValueError( + "Bins must be the same, not " + {bins} + " and " + {hist.member('fN')} + ) + + values += hist.values(flow=True) + fEntries += hist.member("fEntries") + fTsumw += hist.member("fTsumw") + if hist.member("fTsumw2") != None: + fTsumw2 += hist.member("fTsumw2") + fTsumwx += hist.member("fTsumwx") + fTsumwx2 += hist.member("fTsumwx2") + variances += hist.variances("flow=True") + + h_sum = uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), values, + fEntries, fTsumw, fTsumw2, fTsumwx, fTsumwx2, variances, hist.member("fXaxis")) + + file_out = uproot.recreate(destination) + file_out[h_sum.member("fName")] = h_sum + + +# If we can do things in parallel +def hadd_like_tree_reduction(files, destination, *, hist_name=None, threads=1): + import numpy as np + hist = uproot.open(files[0]) + try: + hist = uproot.open(files[0])[hist_name] + except: + # error: name not the same! 
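+        # Placeholder error handling: a real implementation would re-raise with
+        # a helpful message, for example
+        #     raise ValueError(f"histogram {hist_name!r} not found in {files[0]}") from None
+        # (hypothetical wording), instead of setting the `error = 5` stand-in below.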
+        error = 5
+
+    iterations = len(files)
+
+    if (iterations%2) != 0:
+        hist = files[-1]
+        values = hist.values(flow=True)
+        fEntries = hist.member("fEntries")
+        fTsumw = hist.member("fTsumw")
+        if hist.member("fTsumw2") != None:
+            fTsumw2 = hist.member("fTsumw2")
+        fTsumwx = hist.member("fTsumwx")
+        fTsumwx2 = hist.member("fTsumwx2")
+        variances = hist.variances("flow=True")
+    else:
+        values = 0
+        fEntries = 0
+        fTsumw = 0
+        fTsumw2 = 0
+        fTsumwx = 0
+        fTsumwx2 = 0
+        variances = 0
+
+    for i in range(iterations/2):
+        print(i)
+        values, fEntries, fTsumw, fTsumw2, fTsumwx, fTsumwx2, variances += sum_hists(files[i], files[-i], hist_name)
+
+    h_sum = uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), values,
+        fEntries, fTsumw, fTsumw2, fTsumwx, fTsumwx2, variances, hist.member("fXaxis"))
+
+    file_out = uproot.recreate(destination)
+    file_out[h_sum.member("fName")] = h_sum
+
+hadd_like_tree_reduction(["/Users/zobil/Documents/Proteus/file1.root", "/Users/zobil/Documents/Proteus/file2.root"], "place.root", hist_name="name")
+
+def sum_hists(hist1, hist2):
+    # Check bins
+    hist1 = uproot.open(hist1)
+    hist2 = uproot.open(hist2)
+    if hist1.member("fN") != hist2.member("fN"):
+        raise ValueError(
+            "Bins must be the same, not " + {hist1.member("fN")} + " and " + {hist2.member("fN")} # Get file names
+        )
+    values = hist1.values(flow=True) + hist2.values(flow=True)
+    fEntries = hist1.member("fEntries") + hist2.member("fEntries")
+    fTsumw = hist1.member("fTsumw") + hist2.member("fTsumw")
+    if hist1.member("fTsumw2") != None:
+        fTsumw2 = hist1.member("fTsumw2")
+    else:
+        fTsumw2 = 0
+    if hist2.member("fTsumw2") != None:
+        fTsumw2 += hist2.member("fTsumw2")
+    fTsumwx = hist1.member("fTsumwx") + hist2.member("fTsumwx")
+    fTsumwx2 = hist1.member("fTsumwx2") + hist2.member("fTsumwx")
+    variances = hist1.variances("flow=True") + hist2.variances("flow=True")
+    return hist1.member("fN"), values, fEntries, fTsumw, fTsumw2, fTsumwx, fTsumwx2, variances
diff --git a/src/proteus/operations/parquet_to_root.py b/src/proteus/operations/parquet_to_root.py
new file mode 100644
index 0000000..2acccc0
--- /dev/null
+++ b/src/proteus/operations/parquet_to_root.py
@@ -0,0 +1,24 @@
+import dask_awkward as dak
+import uproot
+def parquet_to_root(read_path,
+                    write_path,
+                    *,
+                    columns=None,
+                    storage_options=None,
+                    max_gap=None,
+                    max_block=None,
+                    footer_sample_size=None,
+                    generate_bitmasks=None,
+                    highlevel=None,
+                    behavior=None,
+                    ): # keyword options are accepted but not yet wired through
+    arrays = dak.from_parquet(read_path, split_row_groups=True)
+    out_file = uproot.recreate(write_path)
+    # Materialize one partition at a time so the whole dataset never has to be
+    # in memory: the first partition creates the TTree, the rest extend it.
+    first = arrays.partitions[0].compute()
+    out_file["tree"] = {name: first[name] for name in first.fields} #name? But root files aren't just TTrees...
+    for i in range(1, arrays.npartitions):
+        part = arrays.partitions[i].compute()
+        out_file["tree"].extend({name: part[name] for name in part.fields})
+ 
\ No newline at end of file
diff --git a/src/proteus/to_feather.py b/src/proteus/to_feather.py
new file mode 100644
index 0000000..8d7b065
--- /dev/null
+++ b/src/proteus/to_feather.py
@@ -0,0 +1,65 @@
+import pyarrow as pa
+import pyarrow.parquet as pq
+import awkward as ak
+from fsspec import AbstractFileSystem
+
+def parquet_to_feather(
+    path,
+    new_path, # ?
+    *,
+    columns=None,
+    row_groups=None,
+    storage_options=None,
+    max_gap=64_000,
+    max_block=256_000_000,
+    footer_sample_size=1_000_000,
+    generate_bitmasks=False,
+    highlevel=True,
+    behavior=None,
+):
+    """
+    Args:
+        path (str): Local filename or remote URL, passed to fsspec for resolution.
+            May contain glob patterns.
+        columns (None, str, or list of str): Glob pattern(s) with bash-like curly
+            brackets for matching column names. Nested records are separated by dots.
+ If a list of patterns, the logical-or is matched. If None, all columns + are read. + row_groups (None or set of int): Row groups to read; must be non-negative. + Order is ignored: the output array is presented in the order specified by + Parquet metadata. If None, all row groups/all rows are read. + storage_options: Passed to `fsspec.parquet.open_parquet_file`. + max_gap (int): Passed to `fsspec.parquet.open_parquet_file`. + max_block (int): Passed to `fsspec.parquet.open_parquet_file`. + footer_sample_size (int): Passed to `fsspec.parquet.open_parquet_file`. + generate_bitmasks (bool): If enabled and Arrow/Parquet does not have Awkward + metadata, `generate_bitmasks=True` creates empty bitmasks for nullable + types that don't have bitmasks in the Arrow/Parquet data, so that the + Form (BitMaskedForm vs UnmaskedForm) is predictable. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.contents.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Reads data from a local or remote Parquet file a feather file (or a collection of feather files?). + + Different from ak.to_parquet etc. because... + """ + #create feather file? + + # read one page of parquet file + parquet_file = pq.ParquetFile(path) # does this put the whole thing in memory? + metadata = ak.metadata_from_parquet(path) + # parquet_metadata + # read_row_group or with metadata? + # batch vs page? what size? + # with metadata['fs'].open as fp: #why would this be necessary? + for batch in parquet_file.iter_batches(): + pa.concat(new_path, ak.to_feather(new_path, batch)) #but this shouldn't be something that sets a var to a bigger file?? + + + + + # write to feather file - find concat without + # feather_file = pa.concat([ak.from_parquet(file, page) for page in pages],ignore_index=True) + diff --git a/src/proteus/to_parquet.py b/src/proteus/to_parquet.py new file mode 100644 index 0000000..a596f70 --- /dev/null +++ b/src/proteus/to_parquet.py @@ -0,0 +1,167 @@ +import dask_awkward as da +import _collections_abc +import awkward as ak +import pyarrow.parquet + +# Feather to parquet first? +def feather_to_parquet( + # array, + path, + *, + list_to32=False, + string_to32=True, + bytestring_to32=True, + emptyarray_to=None, + categorical_as_dictionary=False, + extensionarray=True, + count_nulls=True, + compression="zstd", + compression_level=None, + row_group_size=64 * 1024 * 1024, + data_page_size=None, + parquet_flavor=None, + parquet_version="2.4", + parquet_page_version="1.0", + parquet_metadata_statistics=True, + parquet_dictionary_encoding=False, + parquet_byte_stream_split=False, + parquet_coerce_timestamps=None, + parquet_old_int96_timestamps=None, + parquet_compliant_nested=False, # https://issues.apache.org/jira/browse/ARROW-16348 + parquet_extra_options=None, + storage_options=None, + # Potentially need: + # expressions=None, + # cut=None, + # filter_name = no_filter, + # filter_typename = no_filter, + # aliases=None, + # language=uproot.language.python.python_language, + # entry_start=None, + # entry_stop=None, + # step_size="100 MB", + # library="ak", + # how=None, +): + + # Do in steps. can use argument "columns" to select amount - have it be the same as the + # size of a page is in a parquet? Or step_size like in uproot's iterate? + # much to read, can choose bytes, tuple, str, list (not sure what most of those mean here) + # Data page size! 
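+    # A minimal sketch of the batched read/write loop (assumes
+    # `import pyarrow as pa`, `import pyarrow.parquet as pq`, a `destination`
+    # parameter, and a row chunk size; none of these names are settled API here):
+    #
+    #     import pyarrow.feather as feather
+    #     table = feather.read_table(path)  # memory-maps uncompressed files
+    #     writer = pq.ParquetWriter(destination, table.schema)
+    #     for batch in table.to_batches(max_chunksize=100_000):
+    #         writer.write_table(pa.Table.from_batches([batch]))
+    #     writer.close()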
+ # First read ak.feather() + # Read feather also has columns...best to read a bit at a time and keep track? + + """ + Args: + array: Array-like data (anything #ak.to_layout recognizes). + destination (path-like): Name of the output file, file path, or + remote URL passed to [fsspec.core.url_to_fs](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.core.url_to_fs) + for remote writing. + list_to32 (bool): If True, convert Awkward lists into 32-bit Arrow lists + if they're small enough, even if it means an extra conversion. Otherwise, + signed 32-bit #ak.types.ListType maps to Arrow `ListType`, + signed 64-bit #ak.types.ListType maps to Arrow `LargeListType`, + and unsigned 32-bit #ak.types.ListType picks whichever Arrow type its + values fit into. + string_to32 (bool): Same as the above for Arrow `string` and `large_string`. + bytestring_to32 (bool): Same as the above for Arrow `binary` and `large_binary`. + emptyarray_to (None or dtype): If None, #ak.types.UnknownType maps to Arrow's + null type; otherwise, it is converted a given numeric dtype. + categorical_as_dictionary (bool): If True, #ak.contents.IndexedArray and + #ak.contents.IndexedOptionArray labeled with `__array__ = "categorical"` + are mapped to Arrow `DictionaryArray`; otherwise, the projection is + evaluated before conversion (always the case without + `__array__ = "categorical"`). + extensionarray (bool): If True, this function returns extended Arrow arrays + (at all levels of nesting), which preserve metadata so that Awkward \u2192 + Arrow \u2192 Awkward preserves the array's #ak.types.Type (though not + the #ak.forms.Form). If False, this function returns generic Arrow arrays + that might be needed for third-party tools that don't recognize Arrow's + extensions. Even with `extensionarray=False`, the values produced by + Arrow's `to_pylist` method are the same as the values produced by Awkward's + #ak.to_list. + count_nulls (bool): If True, count the number of missing values at each level + and include these in the resulting Arrow array, which makes some downstream + applications faster. If False, skip the up-front cost of counting them. + compression (None, str, or dict): Compression algorithm name, passed to + [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + Parquet supports `{"NONE", "SNAPPY", "GZIP", "BROTLI", "LZ4", "ZSTD"}` + (where `"GZIP"` is also known as "zlib" or "deflate"). If a dict, the keys + are column names (the same column names that #ak.forms.Form.columns returns + and #ak.forms.Form.select_columns accepts) and the values are compression + algorithm names, to compress each column differently. + compression_level (None, int, or dict None): Compression level, passed to + [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + Compression levels have different meanings for different compression + algorithms: GZIP ranges from 1 to 9, but ZSTD ranges from -7 to 22, for + example. Generally, higher numbers provide slower but smaller compression. + row_group_size (int or None): Number of entries in each row group (except the last), + passed to [pyarrow.parquet.ParquetWriter.write_table](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html#pyarrow.parquet.ParquetWriter.write_table). + If None, the Parquet default of 64 MiB is used. 
+ data_page_size (None or int): Number of bytes in each data page, passed to + [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + If None, the Parquet default of 1 MiB is used. + parquet_flavor (None or `"spark"`): If None, the output Parquet file will follow + Arrow conventions; if `"spark"`, it will follow Spark conventions. Some + systems, such as Spark and Google BigQuery, might need Spark conventions, + while others might need Arrow conventions. Passed to + [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + as `flavor`. + parquet_version (`"1.0"`, `"2.4"`, or `"2.6"`): Parquet file format version. + Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + as `version`. + parquet_page_version (`"1.0"` or `"2.0"`): Parquet page format version. + Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + as `data_page_version`. + parquet_metadata_statistics (bool or dict): If True, include summary + statistics for each data page in the Parquet metadata, which lets some + applications search for data more quickly (by skipping pages). If a dict + mapping column names to bool, include summary statistics on only the + specified columns. Passed to + [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + as `write_statistics`. + parquet_dictionary_encoding (bool or dict): If True, allow Parquet to pre-compress + with dictionary encoding. If a dict mapping column names to bool, only + use dictionary encoding on the specified columns. Passed to + [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + as `use_dictionary`. + parquet_byte_stream_split (bool or dict): If True, pre-compress floating + point fields (`float32` or `float64`) with byte stream splitting, which + collects all mantissas in one part of the stream and exponents in another. + Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + as `use_byte_stream_split`. + parquet_coerce_timestamps (None, `"ms"`, or `"us"`): If None, any timestamps + (`datetime64` data) are coerced to a given resolution depending on + `parquet_version`: version `"1.0"` and `"2.4"` are coerced to microseconds, + but later versions use the `datetime64`'s own units. If `"ms"` is explicitly + specified, timestamps are coerced to milliseconds; if `"us"`, microseconds. + Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + as `coerce_timestamps`. + parquet_old_int96_timestamps (None or bool): If True, use Parquet's INT96 format + for any timestamps (`datetime64` data), taking priority over `parquet_coerce_timestamps`. + If None, let the `parquet_flavor` decide. Passed to + [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + as `use_deprecated_int96_timestamps`. 
+ parquet_compliant_nested (bool): If True, use the Spark/BigQuery/Parquet + [convention for nested lists](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types), + in which each list is a one-field record with field name "`element`"; + otherwise, use the Arrow convention, in which the field name is "`item`". + Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + as `use_compliant_nested_type`. + parquet_extra_options (None or dict): Any additional options to pass to + [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). + storage_options (None or dict): Any additional options to pass to + [fsspec.core.url_to_fs](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.core.url_to_fs) + to open a remote file for writing. + """ + + + + # Some kind of for-structure where it checks if there is more of the file before + parquet_writer = pq.ParquetWriter(path, ak.to_parquet(first_batch)) + for i in hasNextPage: + parquet_writer.write_table(i) + + # class pyarrow.parquet.ParquetWriter(where, schema, filesystem=None, flavor=None, version='2.6', use_dictionary=True, compression='snappy', write_statistics=True, use_deprecated_int96_timestamps=None, compression_level=None, use_byte_stream_split=False, column_encoding=None, writer_engine_version=None, data_page_version='1.0', use_compliant_nested_type=True, encryption_properties=None, write_batch_size=None, dictionary_pagesize_limit=None, store_schema=True, write_page_index=False, **options) + parquet_writer.close() + \ No newline at end of file From 35738e44ae9ad1d7bb663b55433b2c6fdb405dbd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?= Date: Tue, 3 Oct 2023 10:57:57 +0200 Subject: [PATCH 02/22] Variables now stored in np array, some flexibility added (directories, finding histogram names etc.) removed attempt at tree reduction --- src/proteus/operations/add_histograms.py | 115 --------------- src/proteus/operations/hadd_like.py | 169 +++++++++++++++++++++++ 2 files changed, 169 insertions(+), 115 deletions(-) delete mode 100644 src/proteus/operations/add_histograms.py create mode 100644 src/proteus/operations/hadd_like.py diff --git a/src/proteus/operations/add_histograms.py b/src/proteus/operations/add_histograms.py deleted file mode 100644 index 163550c..0000000 --- a/src/proteus/operations/add_histograms.py +++ /dev/null @@ -1,115 +0,0 @@ -import uproot - -# Only combines one histogram per file -def hadd_like(files, destination, *, hist_name=None, hist_paths=None): - if hist_name == None: - # ? search through them nicely? Assume there are multiple? - array = hist.classnames() - try: - hist = uproot.open(files[0])[hist_name] - except: - # error: name not the same! 
- error = 5 - - # Base case - bins = hist.member('fN') - values = hist.values(flow=True) - fEntries = hist.member("fEntries") - fTsumw = hist.member("fTsumw") - if hist.member("fTsumw2") != None: - fTsumw2 = hist.member("fTsumw2") - else: - fTsumw2 = 0 - fTsumwx = hist.member("fTsumwx") - fTsumwx2 = hist.member("fTsumwx2") - variances = hist.variances("flow=True") - - # Iteratively / Sequentially: - for path in files[1:]: - with uproot.open(path) as file: - hist = file[hist_name] # histogram = uproot.open("file.root:path/to/histogram") - if bins != hist.member('fN'): - raise ValueError( - "Bins must be the same, not " + {bins} + " and " + {hist.member('fN')} - ) - - values += hist.values(flow=True) - fEntries += hist.member("fEntries") - fTsumw += hist.member("fTsumw") - if hist.member("fTsumw2") != None: - fTsumw2 += hist.member("fTsumw2") - fTsumwx += hist.member("fTsumwx") - fTsumwx2 += hist.member("fTsumwx2") - variances += hist.variances("flow=True") - - h_sum = uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), values, - fEntries, fTsumw, fTsumw2, fTsumwx, fTsumwx2, variances, hist.member("fXaxis")) - - file_out = uproot.recreate(destination) - file_out[h_sum.member("fName")] = h_sum - - -# If we can do things in parallel -def hadd_like_tree_reduction(files, destination, *, hist_name=None, threads=1): - import numpy as np - hist = uproot.open(files[0]) - try: - hist = uproot.open(files[0])[hist_name] - except: - # error: name not the same! - error = 5 - - iterations = len(files) - - if (iterations%2) != 0: - hist = files[-1] - values = hist.values(flow=True) - fEntries = hist.member("fEntries") - fTsumw = hist.member("fTsumw") - if hist.member("fTsumw2") != None: - fTsumw2 = hist.member("fTsumw2") - fTsumwx = hist.member("fTsumwx") - fTsumwx2 = hist.member("fTsumwx2") - variances = hist.variances("flow=True") - else: - values = 0 - fEntries = 0 - fTsumw = 0 - fTsumw2 = 0 - fTsumwx = 0 - fTsumwx2 = 0 - variances = 0 - - for i in range(iterations/2): - print(i) - values, fEntries, fTsumw, fTsumw2, fTsumwx, fTsumwx2, variances += sum_hists(files[i], files[-i], hist_name) - - h_sum = uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), values, - fEntries, fTsumw, fTsumw2, fTsumwx, fTsumwx2, variances, hist.member("fXaxis")) - - file_out = uproot.recreate(destination) - file_out[h_sum.member("fName")] = h_sum - -hadd_like_tree_reduction(["/Users/zobil/Documents/Proteus/file1.root", "/Users/zobil/Documents/Proteus/file2.root"], "place.root", hist_name="name") - -def sum_hists(hist1, hist2): - # Check bins - hist1 = uproot.open(hist1) - hist2 = uproot.open(hist2) - if hist1.member("fN") != hist2.member("fN"): - raise ValueError( - "Bins must be the same, not " + {hist1.member("fN")} + " and " + {hist2.member("fN")} # Get file names - ) - values = hist1.values(flow=True) + hist2.values(flow=True) - fEntries = hist1.member("fEntries") + hist2.member("fEntries") - fTsumw = hist1.member("fTsumw") + hist2.member("fTsumw") - if hist1.member("fTsumw2") != None: - fTsumw2 = hist1.member("fTsumw2") - else: - fTsumw2 = 0 - if hist2.member("fTsumw2") != None: - fTsumw2 += hist2.member("fTsumw2") - fTsumwx = hist1.member("fTsumwx") + hist2.member("fTsumwx") - fTsumwx2 = hist1.member("fTsumwx2") + hist2.member("fTsumwx") - variances = hist1.variances("flow=True") + hist2.variances("flow=True") - return hist1.member("fN"), values, fEntries, fTsumw, fTsumw2, fTsumwx, fTsumwx2, variances diff --git a/src/proteus/operations/hadd_like.py 
b/src/proteus/operations/hadd_like.py new file mode 100644 index 0000000..ee98dd6 --- /dev/null +++ b/src/proteus/operations/hadd_like.py @@ -0,0 +1,169 @@ +import uproot +import numpy as np + + +def add_1D_hists(files, hist_name, members, values, bins): + for path in files[1:]: + with uproot.open(path) as file: + hist = file[hist_name] + if bins != hist.member('fN'): + raise ValueError( + "Bins must be equal, not " + bins + " and " + hist.member('fN') + ) + if hist.member('fName') != hist_name: + raise ValueError( + "Names must be the same, not " + hist_name + " and " + hist.member('fName') + ) + + temp_members = [ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2'), + hist.variances(flow=True) + ] + + values += hist.values(flow=True) + members += temp_members + return uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), values, + *members[0:6], hist.member("fXaxis")) + +def add_2D_hists(files, hist_name, values, members, bins): + for path in files[1:]: + with uproot.open(path) as file: + hist = file[hist_name] + if bins != hist.member('fN'): + raise ValueError( + "Bins must be equal, not "+ {members['fN']} + " and " + {hist.member('fN')} + ) + if hist.member('fName') != hist_name: + raise ValueError( + "Names must be the same, not "+ {members['fN']} + " and " + {hist.member('fName')} + ) + + temp_members = [ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2'), + hist.member('fTsumwy'), + hist.member('fTsumwy2'), + hist.member('fTsumxy'), + hist.variances(flow=True) + ] + members += temp_members + values += hist.values(flow=True) + return uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), values, + *members[0:9], + hist.member("fXaxis"), hist.member("fYaxis")) + +def add_3D_hists(files, hist_names, values, members): + for path in files[1:]: + with uproot.open(path) as file: + hist = file[hist_names] + if members['fN'] != hist.member('fN'): + raise ValueError( + "Bins must be equal, not "+ {members['fN']} + " and " + {hist.member('fN')} + ) + if hist.member('fName') != hist_names: + raise ValueError( + "Names must be the same, not "+ {members['fN']} + " and " + {hist.member('fN')} + ) + temp_members = [ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2'), + hist.member('fTsumwy'), + hist.member('fTsumwy2'), + hist.member('fTsumxy'), + hist.member('fTsumwz'), + hist.member('fTsumwz2'), + hist.member('fTsumwxz'), + hist.member('fTsumwyz'), + hist.variances(flow=True) + ] + members += temp_members + values += hist.values(flow=True) + return uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), values, + *members[0:14], hist.member("fXaxis")) + +def find_histograms(file): + # for i in filenames: + with uproot.open(file) as i: + array = i.classnames() + list = np.array([i for i in array if (array.get(i).startswith("TH1") or array.get(i).startswith("TH2") or array.get(i).startswith("TH3"))]) + return list + +def hadd_like(destination, filenames=None, directory=None, hist_names=None): + """ + Args: + destination (path-like): Name of the output file or file path. + filenames (None, or list of str): + directory (None, str): Local path, may contain glob patterns + hist_names (None, str, or list of str): Names of histograms to be added together. 
+ + Adds together histograms from local ROOT files of a collection of ROOT files, and writes them to one ROOT file. + """ + if directory!=None: + import glob + filenames = sorted( + glob.glob(directory + f"/**/*{'.root'}", recursive=True) + ) + + if hist_names == None: # if histogram names are not provided + hist_names = find_histograms(filenames[0]) + hist_names = hist_names[0] + + file = uproot.open(filenames[0]) # This file may never close until the end... + hist_name = hist_names + hist = file[hist_name] + bins = hist.member('fN') + if len(hist.axes) == 1: + members = [ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2'), + hist.variances(flow=True) + ] + values = hist.values(flow=True) + h_sum = add_1D_hists(filenames, hist.member('fName'), members, values, bins) + elif len(hist.axes) == 2: + members = [ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2'), + hist.member('fTsumwy'), + hist.member('fTsumwy2'), + hist.member('fTsumxy'), + hist.variances(flow=True) + ] + values = hist.values(flow=True) + h_sum = add_2D_hists(filenames, hist.member('fName'), members, values, bins) + elif len(hist.axes) == 3: + members = [ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2'), + hist.member('fTsumwy'), + hist.member('fTsumwy2'), + hist.member('fTsumxy'), + hist.member('fTsumwz'), + hist.member('fTsumwz2'), + hist.member('fTsumwxz'), + hist.member('fTsumwyz'), + hist.variances(flow=True) + ] + values = hist.values(flow=True) + h_sum = add_3D_hists(filenames, hist.member('fName'), members, values, bins) + file_out = uproot.recreate(destination) # What compression level? 
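+    # Answering the question above: a compression setting can be passed
+    # explicitly, e.g. uproot.recreate(destination, compression=uproot.ZLIB(1));
+    # left at the uproot default here.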
+ file_out[h_sum.member("fName")] = h_sum From 1bcac6ad1c813ec0f72251defc1e55f203bc1de7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?= Date: Wed, 4 Oct 2023 10:34:45 +0200 Subject: [PATCH 03/22] Changed to adding numpy arrays, added to docstrings, changed structure for multiple histograms per file (still untested though) --- src/proteus/operations/add_histograms.py | 186 +++++++++++++++++++++++ src/proteus/operations/hadd_like.py | 53 ++++--- 2 files changed, 217 insertions(+), 22 deletions(-) create mode 100644 src/proteus/operations/add_histograms.py diff --git a/src/proteus/operations/add_histograms.py b/src/proteus/operations/add_histograms.py new file mode 100644 index 0000000..1ee603d --- /dev/null +++ b/src/proteus/operations/add_histograms.py @@ -0,0 +1,186 @@ +import uproot +import numpy as np +import awkward as ak + +def add_1D_hists(files, hist_name): + bins = -1 + for path in files: + with uproot.open(path) as file: + hist = file[hist_name] + if bins == -1: + bins = hist.member('fN') + member_data = np.array([ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2'), + ]) + variances = np.array(hist.variances(flow=True)) + values = np.array(hist.values(flow=True)) + elif bins != hist.member('fN'): + raise ValueError( + "Bins must be equal, not ", bins, " and ", hist.member('fN') + ) + if hist.member('fName') != hist_name: + raise ValueError( + "Names must be the same, not " + hist_name + " and " + hist.member('fName') + ) + + member_data += [ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2') + ] + variances += hist.variances(flow=True) + values += hist.values(flow=True) + return uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), + values, *member_data, variances, hist.member("fXaxis")) + +def add_2D_hists(files, hist_name): + bins = -1 + for path in files: + with uproot.open(path) as file: + hist = file[hist_name] + if bins == -1: + bins = hist.member('fN') + member_data = np.array([ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2'), + hist.member('fTsumwy'), + hist.member('fTsumwy2'), + hist.member('fTsumwxy'), + ]) + variances = np.array(hist.variances(flow=True)) + values = np.array(hist.values(flow=True)) + elif bins != hist.member('fN'): + raise ValueError( + "Bins must be equal, not ", bins, " and ", hist.member('fN') + ) + if hist.member('fName') != hist_name: + raise ValueError( + "Names must be the same, not " + hist_name + " and " + hist.member('fName') + ) + + member_data += [ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2'), + hist.member('fTsumwy'), + hist.member('fTsumwy2'), + hist.member('fTsumwxy') + ] + variances += hist.variances(flow=True) + values += hist.values(flow=True) + return uproot.writing.identify.to_TH2x(hist.member("fName"), hist.member("fTitle"), + values, *member_data, variances, hist.member("fXaxis"), hist.member("fYaxis")) + +def add_3D_hists(files, hist_name): + bins = -1 + for path in files: + with uproot.open(path) as file: + hist = file[hist_name] + if bins == -1: + bins = hist.member('fN') + member_data = np.array([ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2'), + hist.member('fTsumwy'), + 
hist.member('fTsumwy2'),
+                    hist.member('fTsumwxy'),
+                    hist.member('fTsumwz'),
+                    hist.member('fTsumwz2'),
+                    hist.member('fTsumwxz'),
+                    hist.member('fTsumwyz')
+                ])
+                variances = np.array(hist.variances(flow=True))
+                values = np.array(hist.values(flow=True))
+            elif bins != hist.member('fN'):
+                raise ValueError(
+                    "Bins must be equal, not ", bins, " and ", hist.member('fN')
+                )
+            if hist.member('fName') != hist_name:
+                raise ValueError(
+                    "Names must be the same, not " + hist_name + " and " + hist.member('fName')
+                )
+
+            member_data += [
+                hist.member('fEntries'),
+                hist.member('fTsumw'),
+                hist.member('fTsumw2'),
+                hist.member('fTsumwx'),
+                hist.member('fTsumwx2'),
+                hist.member('fTsumwy'),
+                hist.member('fTsumwy2'),
+                hist.member('fTsumwxy'),
+                hist.member('fTsumwz'),
+                hist.member('fTsumwz2'),
+                hist.member('fTsumwxz'),
+                hist.member('fTsumwyz')
+            ]
+            variances += hist.variances(flow=True)
+            values += hist.values(flow=True)
+    return uproot.writing.identify.to_TH3x(hist.member("fName"), hist.member("fTitle"),
+        values, *member_data, variances, hist.member("fXaxis"), hist.member("fYaxis"), hist.member("fZaxis"))
+
+def find_histograms(file):
+    # for i in filenames:
+    with uproot.open(file) as i:
+        array = i.classnames()
+        list = np.array([i for i in array if (array.get(i).startswith("TH1") or array.get(i).startswith("TH2") or array.get(i).startswith("TH3"))])
+        return list
+
+def hadd_like(destination, files, hist_names=None):
+    """
+    Args:
+        destination (path-like): Name of the output file or file path.
+        filenames (None, or list of str): List of local ROOT files to read histograms from.
+        directory (None, str): Local path, may contain glob patterns
+        hist_names (None, str, or list of str): Names of histograms to be added together, must be specified if files contain different numbers of histograms.
+
+    Adds together histograms from local ROOT files of a collection of ROOT files, and writes them to one ROOT file.
+    """
+    if type(files) != list: # Will this always work?
+        import glob
+        files = sorted(
+            glob.glob(files + f"/**/*{'.root'}", recursive=True)
+        )
+
+    if hist_names == None: # if histogram names are not provided
+        hist_names = find_histograms(files[0])
+
+    with uproot.open(files[0]) as file: # This file may never close until the end...
+        hist = file[hist_names]
+        num_axes = len(hist.axes)
+
+    if type(hist_names) == str:
+        if num_axes == 1:
+            h_sum = add_1D_hists(files, hist_names)
+        elif num_axes == 2:
+            h_sum = add_2D_hists(files, hist_names)
+        elif num_axes == 3:
+            h_sum = add_3D_hists(files, hist_names)
+        file_out = uproot.recreate(destination) # What compression level?
+        file_out[h_sum.member("fName")] = h_sum
+    else:
+        file_out = uproot.recreate(destination) # What compression level? Would it still be recreate?
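+        # Recreate is still the right call here: the file is created once, and
+        # every summed histogram is then assigned into it in the loop below;
+        # uproot.update would only be needed to append to an existing file.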
+ for name in hist_names: + if num_axes == 1: + h_sum = add_1D_hists(files, name) + elif num_axes == 2: + h_sum = add_2D_hists(files, name) + elif num_axes == 3: + h_sum = add_3D_hists(files, name) + file_out[h_sum.member("fName")] = h_sum + + diff --git a/src/proteus/operations/hadd_like.py b/src/proteus/operations/hadd_like.py index ee98dd6..354a2fc 100644 --- a/src/proteus/operations/hadd_like.py +++ b/src/proteus/operations/hadd_like.py @@ -8,7 +8,7 @@ def add_1D_hists(files, hist_name, members, values, bins): hist = file[hist_name] if bins != hist.member('fN'): raise ValueError( - "Bins must be equal, not " + bins + " and " + hist.member('fN') + "Bins must be equal, not ", bins, " and ", hist.member('fN') ) if hist.member('fName') != hist_name: raise ValueError( @@ -29,17 +29,20 @@ def add_1D_hists(files, hist_name, members, values, bins): return uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), values, *members[0:6], hist.member("fXaxis")) -def add_2D_hists(files, hist_name, values, members, bins): +def add_2D_hists(files, hist_name, members, values, bins): for path in files[1:]: with uproot.open(path) as file: + if hist_names == None: # if histogram names are not provided + hist_names = find_histograms(path) hist = file[hist_name] + print(path) if bins != hist.member('fN'): raise ValueError( - "Bins must be equal, not "+ {members['fN']} + " and " + {hist.member('fN')} + "Bins must be equal, not ", bins, " and ", hist.member('fN') ) if hist.member('fName') != hist_name: raise ValueError( - "Names must be the same, not "+ {members['fN']} + " and " + {hist.member('fName')} + "Names must be the same, not " + hist_name + " and " + hist.member('fName') ) temp_members = [ @@ -50,26 +53,27 @@ def add_2D_hists(files, hist_name, values, members, bins): hist.member('fTsumwx2'), hist.member('fTsumwy'), hist.member('fTsumwy2'), - hist.member('fTsumxy'), + hist.member('fTsumwxy'), hist.variances(flow=True) ] members += temp_members - values += hist.values(flow=True) - return uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), values, - *members[0:9], + values += np.array(hist.values(flow=True)) + print(type(values)) + return uproot.writing.identify.to_TH2x(hist.member("fName"), hist.member("fTitle"), values, + *members, hist.member("fXaxis"), hist.member("fYaxis")) -def add_3D_hists(files, hist_names, values, members): +def add_3D_hists(files, hist_names, values, members, bins): for path in files[1:]: with uproot.open(path) as file: hist = file[hist_names] if members['fN'] != hist.member('fN'): raise ValueError( - "Bins must be equal, not "+ {members['fN']} + " and " + {hist.member('fN')} + "Bins must be equal, not ", bins, " and ", hist.member('fN') ) if hist.member('fName') != hist_names: raise ValueError( - "Names must be the same, not "+ {members['fN']} + " and " + {hist.member('fN')} + "Names must be the same, not " + hist.member('fName') + " and " + hist.member('fName') ) temp_members = [ hist.member('fEntries'), @@ -79,7 +83,7 @@ def add_3D_hists(files, hist_names, values, members): hist.member('fTsumwx2'), hist.member('fTsumwy'), hist.member('fTsumwy2'), - hist.member('fTsumxy'), + hist.member('fTsumwxy'), hist.member('fTsumwz'), hist.member('fTsumwz2'), hist.member('fTsumwxz'), @@ -88,8 +92,8 @@ def add_3D_hists(files, hist_names, values, members): ] members += temp_members values += hist.values(flow=True) - return uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), values, - *members[0:14], 
hist.member("fXaxis")) + return uproot.writing.identify.to_TH3x(hist.member("fName"), hist.member("fTitle"), values, + *members[0:14], hist.member("fXaxis"), hist.member("fYaxis"), hist.member("fZaxis")) def find_histograms(file): # for i in filenames: @@ -102,13 +106,13 @@ def hadd_like(destination, filenames=None, directory=None, hist_names=None): """ Args: destination (path-like): Name of the output file or file path. - filenames (None, or list of str): + filenames (None, or list of str): List of local ROOT files to read histograms from. directory (None, str): Local path, may contain glob patterns hist_names (None, str, or list of str): Names of histograms to be added together. Adds together histograms from local ROOT files of a collection of ROOT files, and writes them to one ROOT file. """ - if directory!=None: + if directory!=None: # Merge directory and filenames arguments? import glob filenames = sorted( glob.glob(directory + f"/**/*{'.root'}", recursive=True) @@ -116,7 +120,7 @@ def hadd_like(destination, filenames=None, directory=None, hist_names=None): if hist_names == None: # if histogram names are not provided hist_names = find_histograms(filenames[0]) - hist_names = hist_names[0] + file = uproot.open(filenames[0]) # This file may never close until the end... hist_name = hist_names @@ -142,10 +146,11 @@ def hadd_like(destination, filenames=None, directory=None, hist_names=None): hist.member('fTsumwx2'), hist.member('fTsumwy'), hist.member('fTsumwy2'), - hist.member('fTsumxy'), + hist.member('fTsumwxy'), hist.variances(flow=True) ] - values = hist.values(flow=True) + values = np.array(hist.values(flow=True)) + print(type(values)) h_sum = add_2D_hists(filenames, hist.member('fName'), members, values, bins) elif len(hist.axes) == 3: members = [ @@ -156,14 +161,18 @@ def hadd_like(destination, filenames=None, directory=None, hist_names=None): hist.member('fTsumwx2'), hist.member('fTsumwy'), hist.member('fTsumwy2'), - hist.member('fTsumxy'), + hist.member('fTsumwxy'), hist.member('fTsumwz'), hist.member('fTsumwz2'), hist.member('fTsumwxz'), hist.member('fTsumwyz'), hist.variances(flow=True) - ] - values = hist.values(flow=True) + ] + values += np.array(hist.values(flow=True)) h_sum = add_3D_hists(filenames, hist.member('fName'), members, values, bins) file_out = uproot.recreate(destination) # What compression level? 
file_out[h_sum.member("fName")] = h_sum + + +hadd_like("place.root", filenames=["/Users/zobil/Documents/Proteus/tests/file1.root", "/Users/zobil/Documents/Proteus/tests/file2.root"], hist_names="name") +# hadd_like("place.root", directory="/Users/zobil/Documents/Proteus/tests/") From 2bb8f4b922a8a88f2513541dd58b25844d656300 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?= Date: Wed, 4 Oct 2023 11:56:31 +0200 Subject: [PATCH 04/22] Setting up tests and module --- src/proteus/__init__.py | 9 +- src/proteus/operations/__init__.py | 1 + src/proteus/operations/add_histograms.py | 15 +- src/proteus/to_feather.py | 65 --------- src/proteus/to_parquet.py | 167 ----------------------- tests/generate_hists_root.py | 99 ++++++++++++++ tests/test_add_histograms.py | 47 +++++++ 7 files changed, 160 insertions(+), 243 deletions(-) create mode 100644 src/proteus/operations/__init__.py delete mode 100644 src/proteus/to_feather.py delete mode 100644 src/proteus/to_parquet.py create mode 100644 tests/generate_hists_root.py create mode 100644 tests/test_add_histograms.py diff --git a/src/proteus/__init__.py b/src/proteus/__init__.py index 6ef7cae..bf99a9b 100644 --- a/src/proteus/__init__.py +++ b/src/proteus/__init__.py @@ -7,6 +7,11 @@ from __future__ import annotations -from ._version import version as __version__ +from proteus._version import version as __version__ -__all__ = ("__version__",) +from proteus.operations import add_histograms + +__all__ = [x for x in globals() if not x.startswith("_")] + +def __dir__(): + return __all__ \ No newline at end of file diff --git a/src/proteus/operations/__init__.py b/src/proteus/operations/__init__.py new file mode 100644 index 0000000..1308244 --- /dev/null +++ b/src/proteus/operations/__init__.py @@ -0,0 +1 @@ +from proteus.operations.add_histograms import * \ No newline at end of file diff --git a/src/proteus/operations/add_histograms.py b/src/proteus/operations/add_histograms.py index 1ee603d..595ea9e 100644 --- a/src/proteus/operations/add_histograms.py +++ b/src/proteus/operations/add_histograms.py @@ -134,13 +134,12 @@ def add_3D_hists(files, hist_name): values, *member_data, variances, hist.member("fXaxis"), hist.member("fYaxis"), hist.member("fZaxis")) def find_histograms(file): - # for i in filenames: - with uproot.open(file) as i: - array = i.classnames() - list = np.array([i for i in array if (array.get(i).startswith("TH1") or array.get(i).startswith("TH2") or array.get(i).startswith("TH3"))]) - return list + with uproot.open(file) as i: + array = i.classnames() + list = np.array([i for i in array if (array.get(i).startswith("TH1") or array.get(i).startswith("TH2") or array.get(i).startswith("TH3"))]) + return list -def hadd_like(destination, files, hist_names=None): +def add_hists(destination, files, hist_names=None): """ Args: destination (path-like): Name of the output file or file path. @@ -181,6 +180,4 @@ def hadd_like(destination, files, hist_names=None): h_sum = add_2D_hists(files, name) elif num_axes == 3: h_sum = add_3D_hists(files, name) - file_out[h_sum.member("fName")] = h_sum - - + file_out[h_sum.member("fName")] = h_sum \ No newline at end of file diff --git a/src/proteus/to_feather.py b/src/proteus/to_feather.py deleted file mode 100644 index 8d7b065..0000000 --- a/src/proteus/to_feather.py +++ /dev/null @@ -1,65 +0,0 @@ -import pyarrow as pa -import pyarrow.parquet as pq -import awkward as ak -from fsspec import AbstractFileSystem - -def parquet_to_feather( - path, - new_path, # ? 
- *, - columns=None, - row_groups=None, - storage_options=None, - max_gap=64_000, - max_block=256_000_000, - footer_sample_size=1_000_000, - generate_bitmasks=False, - highlevel=True, - behavior=None, -): - """ - Args: - path (str): Local filename or remote URL, passed to fsspec for resolution. - May contain glob patterns. - columns (None, str, or list of str): Glob pattern(s) with bash-like curly - brackets for matching column names. Nested records are separated by dots. - If a list of patterns, the logical-or is matched. If None, all columns - are read. - row_groups (None or set of int): Row groups to read; must be non-negative. - Order is ignored: the output array is presented in the order specified by - Parquet metadata. If None, all row groups/all rows are read. - storage_options: Passed to `fsspec.parquet.open_parquet_file`. - max_gap (int): Passed to `fsspec.parquet.open_parquet_file`. - max_block (int): Passed to `fsspec.parquet.open_parquet_file`. - footer_sample_size (int): Passed to `fsspec.parquet.open_parquet_file`. - generate_bitmasks (bool): If enabled and Arrow/Parquet does not have Awkward - metadata, `generate_bitmasks=True` creates empty bitmasks for nullable - types that don't have bitmasks in the Arrow/Parquet data, so that the - Form (BitMaskedForm vs UnmaskedForm) is predictable. - highlevel (bool): If True, return an #ak.Array; otherwise, return - a low-level #ak.contents.Content subclass. - behavior (None or dict): Custom #ak.behavior for the output array, if - high-level. - - Reads data from a local or remote Parquet file a feather file (or a collection of feather files?). - - Different from ak.to_parquet etc. because... - """ - #create feather file? - - # read one page of parquet file - parquet_file = pq.ParquetFile(path) # does this put the whole thing in memory? - metadata = ak.metadata_from_parquet(path) - # parquet_metadata - # read_row_group or with metadata? - # batch vs page? what size? - # with metadata['fs'].open as fp: #why would this be necessary? - for batch in parquet_file.iter_batches(): - pa.concat(new_path, ak.to_feather(new_path, batch)) #but this shouldn't be something that sets a var to a bigger file?? - - - - - # write to feather file - find concat without - # feather_file = pa.concat([ak.from_parquet(file, page) for page in pages],ignore_index=True) - diff --git a/src/proteus/to_parquet.py b/src/proteus/to_parquet.py deleted file mode 100644 index a596f70..0000000 --- a/src/proteus/to_parquet.py +++ /dev/null @@ -1,167 +0,0 @@ -import dask_awkward as da -import _collections_abc -import awkward as ak -import pyarrow.parquet - -# Feather to parquet first? 
-def feather_to_parquet( - # array, - path, - *, - list_to32=False, - string_to32=True, - bytestring_to32=True, - emptyarray_to=None, - categorical_as_dictionary=False, - extensionarray=True, - count_nulls=True, - compression="zstd", - compression_level=None, - row_group_size=64 * 1024 * 1024, - data_page_size=None, - parquet_flavor=None, - parquet_version="2.4", - parquet_page_version="1.0", - parquet_metadata_statistics=True, - parquet_dictionary_encoding=False, - parquet_byte_stream_split=False, - parquet_coerce_timestamps=None, - parquet_old_int96_timestamps=None, - parquet_compliant_nested=False, # https://issues.apache.org/jira/browse/ARROW-16348 - parquet_extra_options=None, - storage_options=None, - # Potentially need: - # expressions=None, - # cut=None, - # filter_name = no_filter, - # filter_typename = no_filter, - # aliases=None, - # language=uproot.language.python.python_language, - # entry_start=None, - # entry_stop=None, - # step_size="100 MB", - # library="ak", - # how=None, -): - - # Do in steps. can use argument "columns" to select amount - have it be the same as the - # size of a page is in a parquet? Or step_size like in uproot's iterate? - # much to read, can choose bytes, tuple, str, list (not sure what most of those mean here) - # Data page size! - # First read ak.feather() - # Read feather also has columns...best to read a bit at a time and keep track? - - """ - Args: - array: Array-like data (anything #ak.to_layout recognizes). - destination (path-like): Name of the output file, file path, or - remote URL passed to [fsspec.core.url_to_fs](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.core.url_to_fs) - for remote writing. - list_to32 (bool): If True, convert Awkward lists into 32-bit Arrow lists - if they're small enough, even if it means an extra conversion. Otherwise, - signed 32-bit #ak.types.ListType maps to Arrow `ListType`, - signed 64-bit #ak.types.ListType maps to Arrow `LargeListType`, - and unsigned 32-bit #ak.types.ListType picks whichever Arrow type its - values fit into. - string_to32 (bool): Same as the above for Arrow `string` and `large_string`. - bytestring_to32 (bool): Same as the above for Arrow `binary` and `large_binary`. - emptyarray_to (None or dtype): If None, #ak.types.UnknownType maps to Arrow's - null type; otherwise, it is converted a given numeric dtype. - categorical_as_dictionary (bool): If True, #ak.contents.IndexedArray and - #ak.contents.IndexedOptionArray labeled with `__array__ = "categorical"` - are mapped to Arrow `DictionaryArray`; otherwise, the projection is - evaluated before conversion (always the case without - `__array__ = "categorical"`). - extensionarray (bool): If True, this function returns extended Arrow arrays - (at all levels of nesting), which preserve metadata so that Awkward \u2192 - Arrow \u2192 Awkward preserves the array's #ak.types.Type (though not - the #ak.forms.Form). If False, this function returns generic Arrow arrays - that might be needed for third-party tools that don't recognize Arrow's - extensions. Even with `extensionarray=False`, the values produced by - Arrow's `to_pylist` method are the same as the values produced by Awkward's - #ak.to_list. - count_nulls (bool): If True, count the number of missing values at each level - and include these in the resulting Arrow array, which makes some downstream - applications faster. If False, skip the up-front cost of counting them. 
- compression (None, str, or dict): Compression algorithm name, passed to - [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). - Parquet supports `{"NONE", "SNAPPY", "GZIP", "BROTLI", "LZ4", "ZSTD"}` - (where `"GZIP"` is also known as "zlib" or "deflate"). If a dict, the keys - are column names (the same column names that #ak.forms.Form.columns returns - and #ak.forms.Form.select_columns accepts) and the values are compression - algorithm names, to compress each column differently. - compression_level (None, int, or dict None): Compression level, passed to - [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). - Compression levels have different meanings for different compression - algorithms: GZIP ranges from 1 to 9, but ZSTD ranges from -7 to 22, for - example. Generally, higher numbers provide slower but smaller compression. - row_group_size (int or None): Number of entries in each row group (except the last), - passed to [pyarrow.parquet.ParquetWriter.write_table](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html#pyarrow.parquet.ParquetWriter.write_table). - If None, the Parquet default of 64 MiB is used. - data_page_size (None or int): Number of bytes in each data page, passed to - [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). - If None, the Parquet default of 1 MiB is used. - parquet_flavor (None or `"spark"`): If None, the output Parquet file will follow - Arrow conventions; if `"spark"`, it will follow Spark conventions. Some - systems, such as Spark and Google BigQuery, might need Spark conventions, - while others might need Arrow conventions. Passed to - [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). - as `flavor`. - parquet_version (`"1.0"`, `"2.4"`, or `"2.6"`): Parquet file format version. - Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). - as `version`. - parquet_page_version (`"1.0"` or `"2.0"`): Parquet page format version. - Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). - as `data_page_version`. - parquet_metadata_statistics (bool or dict): If True, include summary - statistics for each data page in the Parquet metadata, which lets some - applications search for data more quickly (by skipping pages). If a dict - mapping column names to bool, include summary statistics on only the - specified columns. Passed to - [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). - as `write_statistics`. - parquet_dictionary_encoding (bool or dict): If True, allow Parquet to pre-compress - with dictionary encoding. If a dict mapping column names to bool, only - use dictionary encoding on the specified columns. Passed to - [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). - as `use_dictionary`. - parquet_byte_stream_split (bool or dict): If True, pre-compress floating - point fields (`float32` or `float64`) with byte stream splitting, which - collects all mantissas in one part of the stream and exponents in another. 
- Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). - as `use_byte_stream_split`. - parquet_coerce_timestamps (None, `"ms"`, or `"us"`): If None, any timestamps - (`datetime64` data) are coerced to a given resolution depending on - `parquet_version`: version `"1.0"` and `"2.4"` are coerced to microseconds, - but later versions use the `datetime64`'s own units. If `"ms"` is explicitly - specified, timestamps are coerced to milliseconds; if `"us"`, microseconds. - Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). - as `coerce_timestamps`. - parquet_old_int96_timestamps (None or bool): If True, use Parquet's INT96 format - for any timestamps (`datetime64` data), taking priority over `parquet_coerce_timestamps`. - If None, let the `parquet_flavor` decide. Passed to - [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). - as `use_deprecated_int96_timestamps`. - parquet_compliant_nested (bool): If True, use the Spark/BigQuery/Parquet - [convention for nested lists](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types), - in which each list is a one-field record with field name "`element`"; - otherwise, use the Arrow convention, in which the field name is "`item`". - Passed to [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). - as `use_compliant_nested_type`. - parquet_extra_options (None or dict): Any additional options to pass to - [pyarrow.parquet.ParquetWriter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html). - storage_options (None or dict): Any additional options to pass to - [fsspec.core.url_to_fs](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.core.url_to_fs) - to open a remote file for writing. 
- """ - - - - # Some kind of for-structure where it checks if there is more of the file before - parquet_writer = pq.ParquetWriter(path, ak.to_parquet(first_batch)) - for i in hasNextPage: - parquet_writer.write_table(i) - - # class pyarrow.parquet.ParquetWriter(where, schema, filesystem=None, flavor=None, version='2.6', use_dictionary=True, compression='snappy', write_statistics=True, use_deprecated_int96_timestamps=None, compression_level=None, use_byte_stream_split=False, column_encoding=None, writer_engine_version=None, data_page_version='1.0', use_compliant_nested_type=True, encryption_properties=None, write_batch_size=None, dictionary_pagesize_limit=None, store_schema=True, write_page_index=False, **options) - parquet_writer.close() - \ No newline at end of file diff --git a/tests/generate_hists_root.py b/tests/generate_hists_root.py new file mode 100644 index 0000000..a2499e8 --- /dev/null +++ b/tests/generate_hists_root.py @@ -0,0 +1,99 @@ +import ROOT +import uproot + +# h1 = ROOT.TH1I("name", "title", 10, -4, 4) +# h1.FillRandom("gaus") +# h2 = ROOT.TH1I("name", "title", 10, -4, 4) +# h2.FillRandom("gaus") + +def gen_gause_hists_uproot(): + file_out = uproot.recreate("file1.root") + h1 = uproot.from_pyroot(h1) + h_1 = uproot.writing.identify.to_TH1x(h1.member("fName"), + h1.member("fTitle"), + h1.values(flow=True), + h1.member("fEntries"), + h1.member("fTsumw"), + h1.member("fTsumw2"), + h1.member("fTsumwx"), + h1.member("fTsumwx2"), + h1.variances(flow=True), + h1.member("fXaxis"), + ) + print(h_1) + file_out[h_1.member("fName")] = h_1 + + file_out = uproot.recreate("file2.root") + h2 = uproot.from_pyroot(h2) + h_2 = uproot.writing.identify.to_TH1x(h2.member("fName"), + h2.member("fTitle"), + h2.values(flow=True), + h2.member("fEntries"), + h2.member("fTsumw"), + h2.member("fTsumw2"), + h2.member("fTsumwx"), + h2.member("fTsumwx2"), + h2.variances(flow=True), + h2.member("fXaxis"), + ) + + file_out[h_2.member("fName")] = h_2 + +def gen_gaus_hists_pyroot(names, file_names): + # Will create histograms with same names and bins for files in file_names + for file in file_names: + for name in names: + h = ROOT.TH1I(name, name, 10, -4, 4) + h.FillRandom("gaus") + h.Sumw2() + h.SetDirectory(0) + outHistFile = ROOT.TFile.Open(file, "RECREATE") + outHistFile.cd() + h.Write() + outHistFile.Close() + +def gen_gaus_hists_pyroot(): + h1 = ROOT.TH1I("name", "title", 10, -4, 4) + h1.FillRandom("gaus") + h1.Sumw2() + h1.SetDirectory(0) + outHistFile = ROOT.TFile.Open("file1.root", "RECREATE") + outHistFile.cd() + h1.Write() + outHistFile.Close() + + h2 = ROOT.TH1I("name", "title", 10, -4, 4) + h2.FillRandom("gaus") + h2.Sumw2() + h2.SetDirectory(0) + outHistFile = ROOT.TFile.Open("file2.root", "RECREATE") + outHistFile.cd() + h2.Write() + outHistFile.Close() + +def gen_2dim_hists_pyroot(num_hists, num_files, names): + import numpy as np + xedges = [0, 1, 3, 5] + yedges = [0, 2, 3, 4, 6] + x = np.random.normal(2, 1, 100) + y = np.random.normal(1, 1, 100) + H, xedges, yedges = np.histogram2d(x, y, bins=(xedges, yedges)) + + h1 = ROOT.TH2I("name", "title", len(xedges), 0.0, 5.0, len(yedges), 0.0, 6.0) + h1.Sumw2() + h1.Fill(0,0) + h1.SetDirectory(0) + outHistFile = ROOT.TFile.Open("file2dim1.root", "RECREATE") + outHistFile.cd() + h1.Write() + outHistFile.Close() + + + h2 = ROOT.TH2I("name", "title", len(xedges), 0.0, 5.0, len(yedges), 0.0, 6.0) + h2.Sumw2() + h2.Fill(0,0) + h2.SetDirectory(0) + outHistFile = ROOT.TFile.Open("file2dim2.root", "RECREATE") + outHistFile.cd() + h2.Write() + 
outHistFile.Close() diff --git a/tests/test_add_histograms.py b/tests/test_add_histograms.py new file mode 100644 index 0000000..18f63cd --- /dev/null +++ b/tests/test_add_histograms.py @@ -0,0 +1,47 @@ +import os + +import pytest + +import proteus + +import ROOT + +def make_hists(): + h1 = ROOT.TH1I("name", "title", 10, -4, 4) + h1.FillRandom("gaus") + h1.Sumw2() + h1.SetDirectory(0) + outHistFile = ROOT.TFile.Open("file1.root", "RECREATE") + outHistFile.cd() + h1.Write() + outHistFile.Close() + + h2 = ROOT.TH1I("name", "title", 10, -4, 4) + h2.FillRandom("gaus") + h2.Sumw2() + h2.SetDirectory(0) + outHistFile = ROOT.TFile.Open("file2.root", "RECREATE") + outHistFile.cd() + h2.Write() + outHistFile.Close() + + h3 = ROOT.TH1I("name", "title", 10, -4, 4) + h3.FillRandom("gaus") + h3.Sumw2() + h3.SetDirectory(0) + outHistFile = ROOT.TFile.Open("file3.root", "RECREATE") + outHistFile.cd() + h3.Write() + outHistFile.Close() + +def test_simple(tmp_path): + # 1-Dimensional Histograms, list of files, one histogram per file + destination = os.path.join(tmp_path, "destination.root") + make_hists() + proteus.operations.add_hists("place.root", filenames=["file1.root", "file2.root", "file3.root"], hist_names="name") + + # assert get hists from destination file and compare? + +# hadd_like("place.root", directory="/Users/zobil/Documents/Proteus/tests/") + +test_simple("/") \ No newline at end of file From a913db2f5f0ac47af0b3d1d6a137d2adcf03937a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?= Date: Thu, 5 Oct 2023 11:08:50 +0200 Subject: [PATCH 05/22] small fixes for different cases, added tests --- src/proteus/__init__.py | 5 +- src/proteus/operations/add_histograms.py | 77 +++++++++++----------- tests/test_add_histograms.py | 84 +++++++++++++++++++----- 3 files changed, 106 insertions(+), 60 deletions(-) diff --git a/src/proteus/__init__.py b/src/proteus/__init__.py index bf99a9b..9ece269 100644 --- a/src/proteus/__init__.py +++ b/src/proteus/__init__.py @@ -11,7 +11,4 @@ from proteus.operations import add_histograms -__all__ = [x for x in globals() if not x.startswith("_")] - -def __dir__(): - return __all__ \ No newline at end of file +__all__ = ["add_histograms"] \ No newline at end of file diff --git a/src/proteus/operations/add_histograms.py b/src/proteus/operations/add_histograms.py index 595ea9e..bff67bb 100644 --- a/src/proteus/operations/add_histograms.py +++ b/src/proteus/operations/add_histograms.py @@ -16,26 +16,26 @@ def add_1D_hists(files, hist_name): hist.member('fTsumwx'), hist.member('fTsumwx2'), ]) - variances = np.array(hist.variances(flow=True)) - values = np.array(hist.values(flow=True)) + variances = hist.variances(flow=True) + values = hist.values(flow=True) elif bins != hist.member('fN'): raise ValueError( "Bins must be equal, not ", bins, " and ", hist.member('fN') ) - if hist.member('fName') != hist_name: + elif hist.member('fName') != hist_name: raise ValueError( "Names must be the same, not " + hist_name + " and " + hist.member('fName') ) - - member_data += [ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2') - ] - variances += hist.variances(flow=True) - values += hist.values(flow=True) + else: + member_data += [ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2') + ] + variances += hist.variances(flow=True) + values += hist.values(flow=True) return uproot.writing.identify.to_TH1x(hist.member("fName"), 
hist.member("fTitle"), values, *member_data, variances, hist.member("fXaxis")) @@ -62,23 +62,23 @@ def add_2D_hists(files, hist_name): raise ValueError( "Bins must be equal, not ", bins, " and ", hist.member('fN') ) - if hist.member('fName') != hist_name: + elif hist.member('fName') != hist_name: raise ValueError( "Names must be the same, not " + hist_name + " and " + hist.member('fName') ) - - member_data += [ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - hist.member('fTsumwy'), - hist.member('fTsumwy2'), - hist.member('fTsumwxy') - ] - variances += hist.variances(flow=True) - values += hist.values(flow=True) + else: + member_data += [ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2'), + hist.member('fTsumwy'), + hist.member('fTsumwy2'), + hist.member('fTsumwxy') + ] + variances += hist.variances(flow=True) + values += hist.values(flow=True) return uproot.writing.identify.to_TH2x(hist.member("fName"), hist.member("fTitle"), values, *member_data, variances, hist.member("fXaxis"), hist.member("fYaxis")) @@ -109,12 +109,13 @@ def add_3D_hists(files, hist_name): raise ValueError( "Bins must be equal, not ", bins, " and ", hist.member('fN') ) - if hist.member('fName') != hist_name: + elif hist.member('fName') != hist_name: raise ValueError( "Names must be the same, not " + hist_name + " and " + hist.member('fName') ) - member_data += [ + else: + member_data += [ hist.member('fEntries'), hist.member('fTsumw'), hist.member('fTsumw2'), @@ -127,24 +128,23 @@ def add_3D_hists(files, hist_name): hist.member('fTsumwz2'), hist.member('fTsumwxz'), hist.member('fTsumwyz') - ] - variances += hist.variances(flow=True) - values += hist.values(flow=True) + ] + variances += hist.variances(flow=True) + values += hist.values(flow=True) return uproot.writing.identify.to_TH2x(hist.member("fName"), hist.member("fTitle"), values, *member_data, variances, hist.member("fXaxis"), hist.member("fYaxis"), hist.member("fZaxis")) def find_histograms(file): - with uproot.open(file) as i: - array = i.classnames() - list = np.array([i for i in array if (array.get(i).startswith("TH1") or array.get(i).startswith("TH2") or array.get(i).startswith("TH3"))]) + with uproot.open(file) as h: + array = h.classnames() + list = np.array([h[i].member("fName") for i in array if (array.get(i).startswith("TH1") or array.get(i).startswith("TH2") or array.get(i).startswith("TH3"))]) return list def add_hists(destination, files, hist_names=None): """ Args: destination (path-like): Name of the output file or file path. - filenames (None, or list of str): List of local ROOT files to read histograms from. - directory (None, str): Local path, may contain glob patterns + files (Str or list of str): List of local ROOT files to read histograms from. hist_names (None, str, or list of str): Names of histograms to be added together, must be specified if files contain different numbers of histograms. Adds together histograms from local ROOT files of a collection of ROOT files, and writes them to one ROOT file. @@ -157,9 +157,10 @@ def add_hists(destination, files, hist_names=None): if hist_names == None: # if histogram names are not provided hist_names = find_histograms(files[0]) + # print(hist_names[0].member("fName")) with uproot.open(files[0]) as file: # This file may never close until the end... 
- hist = file[hist_names] + hist = file[hist_names[0]] num_axes = len(hist.axes) if type(hist_names) == str: diff --git a/tests/test_add_histograms.py b/tests/test_add_histograms.py index 18f63cd..8791a68 100644 --- a/tests/test_add_histograms.py +++ b/tests/test_add_histograms.py @@ -1,47 +1,95 @@ import os - -import pytest - +import uproot +import sys +sys.path.append("/Users/zobil/Documents/Proteus/src/") import proteus - import ROOT +import numpy as np -def make_hists(): - h1 = ROOT.TH1I("name", "title", 10, -4, 4) +def gen_1d_root(file_paths): + h1 = ROOT.TH1I("name", "title", 5, -4, 4) h1.FillRandom("gaus") h1.Sumw2() h1.SetDirectory(0) - outHistFile = ROOT.TFile.Open("file1.root", "RECREATE") + outHistFile = ROOT.TFile.Open(file_paths[0], "RECREATE") outHistFile.cd() h1.Write() outHistFile.Close() + h1 = uproot.from_pyroot(h1) - h2 = ROOT.TH1I("name", "title", 10, -4, 4) + h2 = ROOT.TH1I("name", "title", 5, -4, 4) h2.FillRandom("gaus") h2.Sumw2() h2.SetDirectory(0) - outHistFile = ROOT.TFile.Open("file2.root", "RECREATE") + outHistFile = ROOT.TFile.Open(file_paths[1], "RECREATE") outHistFile.cd() h2.Write() outHistFile.Close() + h2 = uproot.from_pyroot(h2) - h3 = ROOT.TH1I("name", "title", 10, -4, 4) + h3 = ROOT.TH1I("name", "title", 5, -4, 4) h3.FillRandom("gaus") h3.Sumw2() h3.SetDirectory(0) - outHistFile = ROOT.TFile.Open("file3.root", "RECREATE") + outHistFile = ROOT.TFile.Open(file_paths[2], "RECREATE") outHistFile.cd() h3.Write() outHistFile.Close() + h3 = uproot.from_pyroot(h3) + return h1, h2, h3 + +def test_simple(tmp_path, file_paths): + h1, h2, h3 = gen_1d_root(file_paths) -def test_simple(tmp_path): - # 1-Dimensional Histograms, list of files, one histogram per file destination = os.path.join(tmp_path, "destination.root") - make_hists() - proteus.operations.add_hists("place.root", filenames=["file1.root", "file2.root", "file3.root"], hist_names="name") + proteus.operations.add_hists(destination, ["tests/directory/file1.root", "tests/directory/file2.root"], hist_names="name") + + with uproot.open(destination) as file: + assert file["name"].member("fN") == h1.member("fN") + assert file["name"].member("fTsumw") == h1.member("fTsumw") + h2.member("fTsumw") + h3.member("fTsumw") + assert np.equal(file["name"].values(flow=True), np.array(h1.values(flow=True) + h2.values(flow=True))).all + +def test_3_glob(tmp_path, file_paths): + h1, h2, h3 = gen_1d_root(file_paths) + + # destination = os.path.join(tmp_path, "destination.root") + proteus.operations.add_hists(os.path.join(tmp_path, "place.root"), "tests/directory") + + with uproot.open("tests/place.root") as file: + assert file["name"].member("fN") == h1.member("fN") + assert file["name"].member("fTsumw") == h1.member("fTsumw") + h2.member("fTsumw") + h3.member("fTsumw") + assert np.equal(file["name"].values(flow=True), np.array(h1.values(flow=True) + h2.values(flow=True) + h3.values(flow=True))).all + +def test_2dim(tmp_path): + xedges = [0, 1, 3, 5] + yedges = [0, 2, 3, 4, 6] + x = np.random.normal(2, 1, 100) + y = np.random.normal(1, 1, 100) + H, xedges, yedges = np.histogram2d(x, y, bins=(xedges, yedges)) + + h1 = ROOT.TH2I("name", "title", len(xedges), 0.0, 5.0, len(yedges), 0.0, 6.0) + h1.Sumw2() + h1.Fill(0,0) + h1.SetDirectory(0) + outHistFile = ROOT.TFile.Open("tests/file2dim1.root", "RECREATE") + outHistFile.cd() + h1.Write() + outHistFile.Close() + + h2 = ROOT.TH2I("name", "title", len(xedges), 0.0, 5.0, len(yedges), 0.0, 6.0) + h2.Sumw2() + h2.Fill(0,0) + h2.SetDirectory(0) + outHistFile = 
ROOT.TFile.Open("tests/file2dim2.root", "RECREATE") + outHistFile.cd() + h2.Write() + outHistFile.Close() - # assert get hists from destination file and compare? + proteus.operations.add_hists("tests/place2.root", ["file2dim1.root", "file2dim2.root"], hist_names="name") -# hadd_like("place.root", directory="/Users/zobil/Documents/Proteus/tests/") + with uproot.open("tests/place2.root") as file: + assert file["name"].member("fN") == h1.member("fN") + assert file["name"].member("fTsumw") == h1.member("fTsumw") + h2.member("fTsumw") + assert np.equal(file["name"].values(flow=True), np.array(h1.values(flow=True) + h2.values(flow=True))).all -test_simple("/") \ No newline at end of file +test_3_glob("",["tests/directory/file1.root","tests/directory/file2.root","tests/directory/file3.root"]) From 2edd868e6e4556d5154e91dd93824bc0a2bcfead Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?= Date: Thu, 5 Oct 2023 16:57:38 +0200 Subject: [PATCH 06/22] Added a sort of partial tree reduction and tests --- src/proteus/operations/add_histograms.py | 110 ++++++++++++++++++----- tests/test_add_histograms.py | 19 +++- 2 files changed, 107 insertions(+), 22 deletions(-) diff --git a/src/proteus/operations/add_histograms.py b/src/proteus/operations/add_histograms.py index bff67bb..4ba3ff0 100644 --- a/src/proteus/operations/add_histograms.py +++ b/src/proteus/operations/add_histograms.py @@ -1,12 +1,11 @@ import uproot import numpy as np -import awkward as ak def add_1D_hists(files, hist_name): bins = -1 for path in files: with uproot.open(path) as file: - hist = file[hist_name] + hist = file[hist_name] # Try catch? if bins == -1: bins = hist.member('fN') member_data = np.array([ @@ -22,17 +21,17 @@ def add_1D_hists(files, hist_name): raise ValueError( "Bins must be equal, not ", bins, " and ", hist.member('fN') ) - elif hist.member('fName') != hist_name: - raise ValueError( - "Names must be the same, not " + hist_name + " and " + hist.member('fName') - ) + # elif hist.member('fName') != hist_name: + # raise ValueError( + # "Names must be the same, not " + hist_name + " and " + hist.member('fName') + # ) else: member_data += [ hist.member('fEntries'), hist.member('fTsumw'), hist.member('fTsumw2'), hist.member('fTsumwx'), - hist.member('fTsumwx2') + hist.member('fTsumwx2'), ] variances += hist.variances(flow=True) values += hist.values(flow=True) @@ -134,13 +133,7 @@ def add_3D_hists(files, hist_name): return uproot.writing.identify.to_TH2x(hist.member("fName"), hist.member("fTitle"), values, *member_data, variances, hist.member("fXaxis"), hist.member("fYaxis"), hist.member("fZaxis")) -def find_histograms(file): - with uproot.open(file) as h: - array = h.classnames() - list = np.array([h[i].member("fName") for i in array if (array.get(i).startswith("TH1") or array.get(i).startswith("TH2") or array.get(i).startswith("TH3"))]) - return list - -def add_hists(destination, files, hist_names=None): +def add_hists(destination, files, hist_names=None, tree_reduction=False): """ Args: destination (path-like): Name of the output file or file path. 
@@ -154,16 +147,23 @@ def add_hists(destination, files, hist_names=None): files = sorted( glob.glob(files + f"/**/*{'.root'}", recursive=True) ) - + if hist_names == None: # if histogram names are not provided - hist_names = find_histograms(files[0]) - # print(hist_names[0].member("fName")) + with uproot.open(files[0]) as h: + array = h.classnames() + hist_names = np.array([h[i].member("fName") for i in array if (array.get(i).startswith("TH1") or array.get(i).startswith("TH2") or array.get(i).startswith("TH3"))]) + - with uproot.open(files[0]) as file: # This file may never close until the end... - hist = file[hist_names[0]] + with uproot.open(files[0]) as file: + hist = file[[str(hist_names)][0]] num_axes = len(hist.axes) + if tree_reduction == True: + h_sum = tree_reduction_add(files, hist_names) + if type(hist_names) == str: + if tree_reduction == True: + h_sum = tree_reduction_add(files, hist_names) if num_axes == 1: h_sum = add_1D_hists(files, hist_names) elif num_axes == 2: @@ -172,13 +172,83 @@ def add_hists(destination, files, hist_names=None): h_sum = add_3D_hists(files, hist_names) file_out = uproot.recreate(destination) # What compression level? file_out[h_sum.member("fName")] = h_sum + else: file_out = uproot.recreate(destination) # What compression level? Would it still be recreate? for name in hist_names: + if tree_reduction == True: + h_sum = tree_reduction_add(files, name) if num_axes == 1: h_sum = add_1D_hists(files, name) elif num_axes == 2: h_sum = add_2D_hists(files, name) elif num_axes == 3: h_sum = add_3D_hists(files, name) - file_out[h_sum.member("fName")] = h_sum \ No newline at end of file + file_out[h_sum.member("fName")] = h_sum + +def tree_reduction_add(files, hist_name): + # *** Partial tree reduction... + + # Get rid of need for all the dtype conversions? + indx = int(0) + i = int(len(files)/2) + member_data = np.ndarray((i,5)) + values, variances = np.ndarray(i), np.ndarray(i) + + if type(files) != list: # Will this always work? + import glob + files = sorted( + glob.glob(files + f"/**/*{'.root'}", recursive=True) + ) + + x_axis = "" + title = "" + while indx+1 <= i: + with uproot.open(files[indx]) as file1: + with uproot.open(files[indx+1]) as file2: + try: + hist1, hist2 = file1[hist_name], file2[hist_name] + except: + raise ValueError("Names of histograms must all be the same.") # How get other hist name? + title = hist1.member("fTitle") + x_axis = hist1.member("fXaxis") + hist1, hist2 = file1[hist_name], file2[hist_name] + i = indx/int(2) + member_data[:] = np.add(np.array([ + hist1.member('fEntries'), + hist1.member('fTsumw'), + hist1.member('fTsumw2'), + hist1.member('fTsumwx'), + hist1.member('fTsumwx2'), + ]), np.array([ + hist2.member('fEntries'), + hist2.member('fTsumw'), + hist2.member('fTsumw2'), + hist2.member('fTsumwx'), + hist2.member('fTsumwx2'), + ])) + variances = np.add(hist1.variances(flow=True), hist2.variances(flow=True)) + values = np.add(hist1.values(flow=True), hist2.values(flow=True)) + indx+=2 + if(len(files)%2==1): + with uproot.open(files[-1]) as file: + try: + hist = file[hist_name] + except: + raise ValueError("Names of histograms must all be the same.") # How get other hist name? + + member_data[-1] += np.array([ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2'), + ]) + variances += hist.variances(flow=True) + values += hist.values(flow=True) + try: + return uproot.writing.identify.to_TH1x(hist_name, title, # pass Title? 
It may end up random + values, *np.sum(member_data, axis=0), variances, x_axis) + except: + print("Write failed.") + print("Bins must be the same size.") # Change! \ No newline at end of file diff --git a/tests/test_add_histograms.py b/tests/test_add_histograms.py index 8791a68..84352c8 100644 --- a/tests/test_add_histograms.py +++ b/tests/test_add_histograms.py @@ -42,7 +42,18 @@ def test_simple(tmp_path, file_paths): h1, h2, h3 = gen_1d_root(file_paths) destination = os.path.join(tmp_path, "destination.root") - proteus.operations.add_hists(destination, ["tests/directory/file1.root", "tests/directory/file2.root"], hist_names="name") + proteus.operations.add_hists(destination, file_paths, hist_names="name", tree_reduction=False) + + with uproot.open(destination) as file: + assert file["name"].member("fN") == h1.member("fN") + assert file["name"].member("fTsumw") == h1.member("fTsumw") + h2.member("fTsumw") + h3.member("fTsumw") + assert np.equal(file["name"].values(flow=True), np.array(h1.values(flow=True) + h2.values(flow=True))).all + +def test_tree_reduction(tmp_path, file_paths): + h1, h2, h3 = gen_1d_root(file_paths) + + destination = os.path.join(tmp_path, "destination.root") + proteus.operations.add_hists(destination, file_paths, hist_names="name", tree_reduction=True) with uproot.open(destination) as file: assert file["name"].member("fN") == h1.member("fN") @@ -90,6 +101,10 @@ def test_2dim(tmp_path): with uproot.open("tests/place2.root") as file: assert file["name"].member("fN") == h1.member("fN") assert file["name"].member("fTsumw") == h1.member("fTsumw") + h2.member("fTsumw") + print(file["name"].values(flow=True)) assert np.equal(file["name"].values(flow=True), np.array(h1.values(flow=True) + h2.values(flow=True))).all -test_3_glob("",["tests/directory/file1.root","tests/directory/file2.root","tests/directory/file3.root"]) +# def test_partial_tree_reduction(): + +# test_3_glob("",["tests/directory/file1.root","tests/directory/file2.root","tests/directory/file3.root"]) +test_simple("",["tests/directory/file1.root","tests/directory/file2.root","tests/directory/file3.root"]) \ No newline at end of file From 99815e323bd6fb56fb026808cabbe6c3f33dcb00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?= Date: Thu, 12 Oct 2023 16:23:08 +0200 Subject: [PATCH 07/22] rewrote to work smoother, added options from ROOT hadd and started commandline arguments --- src/proteus/operations/__init__.py | 3 +- src/proteus/operations/add_histograms.py | 403 +++++++++++------------ src/proteus/operations/hadd_like.py | 178 ---------- 3 files changed, 199 insertions(+), 385 deletions(-) delete mode 100644 src/proteus/operations/hadd_like.py diff --git a/src/proteus/operations/__init__.py b/src/proteus/operations/__init__.py index 1308244..fcafb42 100644 --- a/src/proteus/operations/__init__.py +++ b/src/proteus/operations/__init__.py @@ -1 +1,2 @@ -from proteus.operations.add_histograms import * \ No newline at end of file +from proteus.operations.add_histograms import * +from proteus.operations.temp import * \ No newline at end of file diff --git a/src/proteus/operations/add_histograms.py b/src/proteus/operations/add_histograms.py index 4ba3ff0..91e6044 100644 --- a/src/proteus/operations/add_histograms.py +++ b/src/proteus/operations/add_histograms.py @@ -1,94 +1,115 @@ import uproot import numpy as np +import argparse +import os -def add_1D_hists(files, hist_name): - bins = -1 - for path in files: - with uproot.open(path) as file: - hist = file[hist_name] # Try catch? 
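# The recurring "# Try catch?" question has a narrow answer: uproot raises a
# subclass of KeyError for a missing object, so the lookup can be guarded
# precisely instead of with a bare `except:`. A sketch -- `read_hist` and
# `skip_missing` are hypothetical names, not part of this module:
import uproot

def read_hist(path, name, skip_missing=False):
    with uproot.open(path) as f:
        try:
            return f[name]
        except KeyError:  # uproot's missing-key error subclasses KeyError
            if skip_missing:
                return None
            raise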
- if bins == -1: - bins = hist.member('fN') - member_data = np.array([ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - ]) - variances = hist.variances(flow=True) - values = hist.values(flow=True) - elif bins != hist.member('fN'): - raise ValueError( - "Bins must be equal, not ", bins, " and ", hist.member('fN') - ) - # elif hist.member('fName') != hist_name: - # raise ValueError( - # "Names must be the same, not " + hist_name + " and " + hist.member('fName') - # ) - else: - member_data += [ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - ] - variances += hist.variances(flow=True) - values += hist.values(flow=True) - return uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), - values, *member_data, variances, hist.member("fXaxis")) +def get_1d_data(hist): + return np.array([ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2'), + ]) + +def add_1D_hists(destination, file, key, union, first, keys, skip_errors): + outfile = uproot.open(destination) + try: + hist = file[key] # Try catch? + except: + if union: + print('New key') + return keys.append(), None + elif skip_errors: + return keys, None + else: + ValueError("Histogram ", key, " missing from other files") + if first: + member_data = np.array([ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2'), + ]) + return keys, uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), + hist.values(flow=True), *member_data, hist.variances(flow=True), hist.member("fXaxis")) + elif hist.member('fN') == outfile[key].member('fN'): + member_data = np.array([ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2'), + ]) + h_sum = uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), outfile[key].values(flow=True) + hist.values(flow=True), + *np.add(np.array([outfile[key].member('fEntries'), outfile[key].member('fTsumw'), outfile[key].member('fTsumw2'), outfile[key].member('fTsumwx'), + outfile[key].member('fTsumwx2')]), member_data), outfile[key].variances(flow=True) + hist.variances(flow=True), hist.member("fXaxis")) + file.close() + return keys, h_sum + + +def add_2D_hists(destination, file, key, union, first, keys, skip_errors): + # bins = -1 + # for path in files: + outfile = uproot.open(destination) + # keys = {keys} + # for key in keys: + try: + hist = file[key] # Try catch? 
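# The union/intersection choice threaded through the functions above can be
# decided once, before the main loop, by collecting key sets from every input.
# A sketch -- `collect_keys` is a hypothetical helper, not this module's API:
import uproot

def collect_keys(files, union=False):
    key_sets = []
    for path in files:
        with uproot.open(path) as f:
            key_sets.append(set(f.keys(cycle=False)))
    # Union keeps every name seen anywhere; intersection (closer to hadd's
    # default behaviour) keeps only names present in all files.
    merged = set.union(*key_sets) if union else set.intersection(*key_sets)
    return sorted(merged)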
+ except: + if union: + print('New key') + keys.append() + elif skip_errors: + return keys, None + else: + ValueError("Histogram ", key, " missing from other files") + if first: + member_data = np.array([ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2'), + hist.member('fTsumwy'), + hist.member('fTsumwy2'), + hist.member('fTsumwxy') + ]) + return keys, uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), + np.ravel(hist.values(flow=True), order='C'), *member_data, np.ravel(hist.variances(flow=True), order='C'), hist.member("fXaxis"), hist.member("fYaxis")) + elif hist.member('fN') == outfile[key].member('fN'): + member_data = np.array([ + hist.member('fEntries'), + hist.member('fTsumw'), + hist.member('fTsumw2'), + hist.member('fTsumwx'), + hist.member('fTsumwx2'), + hist.member('fTsumwy'), + hist.member('fTsumwy2'), + hist.member('fTsumwxy') + ]) + h_sum = uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), outfile[key].values(flow=True) + hist.values(flow=True), + *np.add(np.array([outfile[key].member('fEntries'), outfile[key].member('fTsumw'), outfile[key].member('fTsumw2'), outfile[key].member('fTsumwx'), + outfile[key].member('fTsumwx2')]), member_data), outfile[key].variances(flow=True) + hist.variances(flow=True), hist.member("fXaxis")) + file.close() + return keys, h_sum -def add_2D_hists(files, hist_name): - bins = -1 - for path in files: - with uproot.open(path) as file: - hist = file[hist_name] - if bins == -1: - bins = hist.member('fN') - member_data = np.array([ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - hist.member('fTsumwy'), - hist.member('fTsumwy2'), - hist.member('fTsumwxy'), - ]) - variances = np.array(hist.variances(flow=True)) - values = np.array(hist.values(flow=True)) - elif bins != hist.member('fN'): - raise ValueError( - "Bins must be equal, not ", bins, " and ", hist.member('fN') - ) - elif hist.member('fName') != hist_name: - raise ValueError( - "Names must be the same, not " + hist_name + " and " + hist.member('fName') - ) - else: - member_data += [ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - hist.member('fTsumwy'), - hist.member('fTsumwy2'), - hist.member('fTsumwxy') - ] - variances += hist.variances(flow=True) - values += hist.values(flow=True) - return uproot.writing.identify.to_TH2x(hist.member("fName"), hist.member("fTitle"), - values, *member_data, variances, hist.member("fXaxis"), hist.member("fYaxis")) -def add_3D_hists(files, hist_name): - bins = -1 - for path in files: - with uproot.open(path) as file: - hist = file[hist_name] - if bins == -1: - bins = hist.member('fN') - member_data = np.array([ +def add_3D_hists(destination, file, key, union, first, keys, skip_errors): + outfile = uproot.open(destination) + try: + hist = file[key] # Try catch? 
+ except: + if union: + print('New key') + return keys.append(), None + elif skip_errors: + return keys, None + else: + ValueError("Histogram ", key, " missing from other files") + if first: + member_data = np.array([ hist.member('fEntries'), hist.member('fTsumw'), hist.member('fTsumw2'), @@ -101,20 +122,11 @@ def add_3D_hists(files, hist_name): hist.member('fTsumwz2'), hist.member('fTsumwxz'), hist.member('fTsumwyz') - ]) - variances = np.array(hist.variances(flow=True)) - values = np.array(hist.values(flow=True)) - elif bins != hist.member('fN'): - raise ValueError( - "Bins must be equal, not ", bins, " and ", hist.member('fN') - ) - elif hist.member('fName') != hist_name: - raise ValueError( - "Names must be the same, not " + hist_name + " and " + hist.member('fName') - ) - - else: - member_data += [ + ]) + return uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), + np.ravel(hist.values(flow=True), order='C'), *member_data, np.ravel(hist.variances(flow=True), order='C'), hist.member("fXaxis"), hist.member("fYaxis"), hist.member("fZaxis")) + elif hist.member('fN') == outfile[key].member('fN'): + member_data = np.add(np.array([ hist.member('fEntries'), hist.member('fTsumw'), hist.member('fTsumw2'), @@ -127,128 +139,107 @@ def add_3D_hists(files, hist_name): hist.member('fTsumwz2'), hist.member('fTsumwxz'), hist.member('fTsumwyz') - ] - variances += hist.variances(flow=True) - values += hist.values(flow=True) - return uproot.writing.identify.to_TH2x(hist.member("fName"), hist.member("fTitle"), - values, *member_data, variances, hist.member("fXaxis"), hist.member("fYaxis"), hist.member("fZaxis")) + ]), np.array(hist.member('fEntries'), + outfile[key].member('fTsumw'), + outfile[key].member('fTsumw2'), + outfile[key].member('fTsumwx'), + outfile[key].member('fTsumwx2'), + outfile[key].member('fTsumwy'), + outfile[key].member('fTsumwy2'), + outfile[key].member('fTsumwxy'), + outfile[key].member('fTsumwz'), + outfile[key].member('fTsumwz2'), + outfile[key].member('fTsumwxz'), + outfile[key].member('fTsumwyz'))) + h_sum = uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), np.ravel(outfile[key].values(flow=True) + hist.values(flow=True), order='C'), + *member_data, np.ravel((outfile[key].variances(flow=True) + hist.variances(flow=True)), order='C'), hist.member("fXaxis"), hist.member("fYaxis"), hist.member("fZaxis")) + file.close() + return keys, h_sum + +def add_hists( + destination, + files, + *, + target_compression=1, + tree_reduction=False, + append=False, + force=False, + no_trees=True, + skip_errors=False, + max_opened_files=0, + union=False, # Union vs intersection + same_name_only=True + ): -def add_hists(destination, files, hist_names=None, tree_reduction=False): """ Args: destination (path-like): Name of the output file or file path. files (Str or list of str): List of local ROOT files to read histograms from. hist_names (None, str, or list of str): Names of histograms to be added together, must be specified if files contain different numbers of histograms. - + force (bool): If True, overwrites destination file if it exists. + append (bool): If True, appends histograms to an existing file. + skip_errors (bool): If True, skips corrupt or non-existant files without exiting. + max_opened_files (int): Limits the number of files to be open at the same time. + skip_extra (bool): If True, ignores histograms that are not in all files. If False, writes all histograms to destination file. 
+ no_extra (bool): If True, throws an error if files do not have the same histograms. + Adds together histograms from local ROOT files of a collection of ROOT files, and writes them to one ROOT file. """ + if os.path.isfile(destination): + if not force and not append: + raise FileExistsError + elif force and append: + raise ValueError("Cannot append to a new file. Either force or append can be true.") + if force: + file_out = uproot.recreate(destination) + if type(files) != list: # Will this always work? + if files.endswith('.txt'): + readfile=readfile import glob files = sorted( glob.glob(files + f"/**/*{'.root'}", recursive=True) ) - - if hist_names == None: # if histogram names are not provided - with uproot.open(files[0]) as h: - array = h.classnames() - hist_names = np.array([h[i].member("fName") for i in array if (array.get(i).startswith("TH1") or array.get(i).startswith("TH2") or array.get(i).startswith("TH3"))]) - - - with uproot.open(files[0]) as file: - hist = file[[str(hist_names)][0]] - num_axes = len(hist.axes) - - if tree_reduction == True: - h_sum = tree_reduction_add(files, hist_names) - - if type(hist_names) == str: - if tree_reduction == True: - h_sum = tree_reduction_add(files, hist_names) - if num_axes == 1: - h_sum = add_1D_hists(files, hist_names) - elif num_axes == 2: - h_sum = add_2D_hists(files, hist_names) - elif num_axes == 3: - h_sum = add_3D_hists(files, hist_names) - file_out = uproot.recreate(destination) # What compression level? - file_out[h_sum.member("fName")] = h_sum + if no_trees: + with uproot.open(files[0]) as file: + # iterclassnames ? https://uproot.readthedocs.io/en/latest/uproot.reading.ReadOnlyDirectory.html + keys = file.keys(cycle=False) + print(type(keys)) + keys_axes = dict(zip(keys, (len(file[i].axes) for i in keys))) + # print(file.classnames()) else: - file_out = uproot.recreate(destination) # What compression level? Would it still be recreate? - for name in hist_names: - if tree_reduction == True: - h_sum = tree_reduction_add(files, name) - if num_axes == 1: - h_sum = add_1D_hists(files, name) - elif num_axes == 2: - h_sum = add_2D_hists(files, name) - elif num_axes == 3: - h_sum = add_3D_hists(files, name) - file_out[h_sum.member("fName")] = h_sum - -def tree_reduction_add(files, hist_name): - # *** Partial tree reduction... + with uproot.open(files[0]) as file: + #filter for both TTrees and histograms + keys = file.keys(filter_classname='[TH[1|2|3][I|S|F|D|C]|TTREE]', cycle=False) # Actually might account for subdirectories and everything? https://uproot.readthedocs.io/en/latest/basic.html#finding-objects-in-a-file + keys_axes = dict(zip(keys, (len(file[i].axes) for i in keys))) - # Get rid of need for all the dtype conversions? - indx = int(0) - i = int(len(files)/2) - member_data = np.ndarray((i,5)) - values, variances = np.ndarray(i), np.ndarray(i) + first = True + for file in files: + try: + file = uproot.open(file) + except: + Warning("File: " + {file} + " does not exist or is corrupt.") + continue + for key in keys: + if keys_axes[key] == 1: + keys, h_sum = add_1D_hists(destination, file, key, union, first, keys, skip_errors) + elif keys_axes[key] == 2: + keys, h_sum = add_2D_hists(destination, file, key, union, first, keys, skip_errors) + else: + keys, h_sum = add_3D_hists(destination, file, key, union, first, keys, skip_errors) + if h_sum != None: + file_out[key] = h_sum + first = False + file.close() - if type(files) != list: # Will this always work? 
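# The partial reduction being removed here can be completed as a full pairwise
# ("tree") reduction in a few lines. A sketch, assuming a non-empty input list;
# merge_pair(a, b) is a hypothetical callable that merges two inputs:
def tree_reduce(items, merge_pair):
    level = list(items)
    while len(level) > 1:
        nxt = [merge_pair(level[i], level[i + 1])
               for i in range(0, len(level) - 1, 2)]
        if len(level) % 2:        # carry the odd one out to the next round
            nxt.append(level[-1])
        level = nxt
    return level[0]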
- import glob - files = sorted( - glob.glob(files + f"/**/*{'.root'}", recursive=True) - ) +def args(): + argparser = argparse.ArgumentParser(description="Hadd ROOT histograms with Uproot") + argparser.add_argument("destination", type=str, help="path of output file") + argparser.add_argument("input_files", type=str, nargs="+", help="list or directory (glob syntax accepted) of input files") + argparser.add_argument("-f", action="store_true",default=False, help="force overwrite of output file") - x_axis = "" - title = "" - while indx+1 <= i: - with uproot.open(files[indx]) as file1: - with uproot.open(files[indx+1]) as file2: - try: - hist1, hist2 = file1[hist_name], file2[hist_name] - except: - raise ValueError("Names of histograms must all be the same.") # How get other hist name? - title = hist1.member("fTitle") - x_axis = hist1.member("fXaxis") - hist1, hist2 = file1[hist_name], file2[hist_name] - i = indx/int(2) - member_data[:] = np.add(np.array([ - hist1.member('fEntries'), - hist1.member('fTsumw'), - hist1.member('fTsumw2'), - hist1.member('fTsumwx'), - hist1.member('fTsumwx2'), - ]), np.array([ - hist2.member('fEntries'), - hist2.member('fTsumw'), - hist2.member('fTsumw2'), - hist2.member('fTsumwx'), - hist2.member('fTsumwx2'), - ])) - variances = np.add(hist1.variances(flow=True), hist2.variances(flow=True)) - values = np.add(hist1.values(flow=True), hist2.values(flow=True)) - indx+=2 - if(len(files)%2==1): - with uproot.open(files[-1]) as file: - try: - hist = file[hist_name] - except: - raise ValueError("Names of histograms must all be the same.") # How get other hist name? +def tree_reduction(max_opened_files): + # Root checks system max opened files + work = work - member_data[-1] += np.array([ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - ]) - variances += hist.variances(flow=True) - values += hist.values(flow=True) - try: - return uproot.writing.identify.to_TH1x(hist_name, title, # pass Title? It may end up random - values, *np.sum(member_data, axis=0), variances, x_axis) - except: - print("Write failed.") - print("Bins must be the same size.") # Change! 
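The `args()` stub above builds a parser but never calls `parse_args()` or dispatches. A minimal complete wiring, mirroring the flags shown (the `main` function and the `--force` long name are assumptions, not part of the patch):

import argparse

def main():
    parser = argparse.ArgumentParser(description="Hadd ROOT histograms with Uproot")
    parser.add_argument("destination", type=str, help="path of output file")
    parser.add_argument("input_files", type=str, nargs="+",
                        help="list or directory (glob syntax accepted) of input files")
    parser.add_argument("-f", "--force", action="store_true", default=False,
                        help="force overwrite of output file")
    opts = parser.parse_args()
    add_hists(opts.destination, opts.input_files, force=opts.force)

if __name__ == "__main__":
    main()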
\ No newline at end of file diff --git a/src/proteus/operations/hadd_like.py b/src/proteus/operations/hadd_like.py deleted file mode 100644 index 354a2fc..0000000 --- a/src/proteus/operations/hadd_like.py +++ /dev/null @@ -1,178 +0,0 @@ -import uproot -import numpy as np - - -def add_1D_hists(files, hist_name, members, values, bins): - for path in files[1:]: - with uproot.open(path) as file: - hist = file[hist_name] - if bins != hist.member('fN'): - raise ValueError( - "Bins must be equal, not ", bins, " and ", hist.member('fN') - ) - if hist.member('fName') != hist_name: - raise ValueError( - "Names must be the same, not " + hist_name + " and " + hist.member('fName') - ) - - temp_members = [ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - hist.variances(flow=True) - ] - - values += hist.values(flow=True) - members += temp_members - return uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), values, - *members[0:6], hist.member("fXaxis")) - -def add_2D_hists(files, hist_name, members, values, bins): - for path in files[1:]: - with uproot.open(path) as file: - if hist_names == None: # if histogram names are not provided - hist_names = find_histograms(path) - hist = file[hist_name] - print(path) - if bins != hist.member('fN'): - raise ValueError( - "Bins must be equal, not ", bins, " and ", hist.member('fN') - ) - if hist.member('fName') != hist_name: - raise ValueError( - "Names must be the same, not " + hist_name + " and " + hist.member('fName') - ) - - temp_members = [ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - hist.member('fTsumwy'), - hist.member('fTsumwy2'), - hist.member('fTsumwxy'), - hist.variances(flow=True) - ] - members += temp_members - values += np.array(hist.values(flow=True)) - print(type(values)) - return uproot.writing.identify.to_TH2x(hist.member("fName"), hist.member("fTitle"), values, - *members, - hist.member("fXaxis"), hist.member("fYaxis")) - -def add_3D_hists(files, hist_names, values, members, bins): - for path in files[1:]: - with uproot.open(path) as file: - hist = file[hist_names] - if members['fN'] != hist.member('fN'): - raise ValueError( - "Bins must be equal, not ", bins, " and ", hist.member('fN') - ) - if hist.member('fName') != hist_names: - raise ValueError( - "Names must be the same, not " + hist.member('fName') + " and " + hist.member('fName') - ) - temp_members = [ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - hist.member('fTsumwy'), - hist.member('fTsumwy2'), - hist.member('fTsumwxy'), - hist.member('fTsumwz'), - hist.member('fTsumwz2'), - hist.member('fTsumwxz'), - hist.member('fTsumwyz'), - hist.variances(flow=True) - ] - members += temp_members - values += hist.values(flow=True) - return uproot.writing.identify.to_TH3x(hist.member("fName"), hist.member("fTitle"), values, - *members[0:14], hist.member("fXaxis"), hist.member("fYaxis"), hist.member("fZaxis")) - -def find_histograms(file): - # for i in filenames: - with uproot.open(file) as i: - array = i.classnames() - list = np.array([i for i in array if (array.get(i).startswith("TH1") or array.get(i).startswith("TH2") or array.get(i).startswith("TH3"))]) - return list - -def hadd_like(destination, filenames=None, directory=None, hist_names=None): - """ - Args: - destination (path-like): Name of the output file or file 
path. - filenames (None, or list of str): List of local ROOT files to read histograms from. - directory (None, str): Local path, may contain glob patterns - hist_names (None, str, or list of str): Names of histograms to be added together. - - Adds together histograms from local ROOT files of a collection of ROOT files, and writes them to one ROOT file. - """ - if directory!=None: # Merge directory and filenames arguments? - import glob - filenames = sorted( - glob.glob(directory + f"/**/*{'.root'}", recursive=True) - ) - - if hist_names == None: # if histogram names are not provided - hist_names = find_histograms(filenames[0]) - - - file = uproot.open(filenames[0]) # This file may never close until the end... - hist_name = hist_names - hist = file[hist_name] - bins = hist.member('fN') - if len(hist.axes) == 1: - members = [ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - hist.variances(flow=True) - ] - values = hist.values(flow=True) - h_sum = add_1D_hists(filenames, hist.member('fName'), members, values, bins) - elif len(hist.axes) == 2: - members = [ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - hist.member('fTsumwy'), - hist.member('fTsumwy2'), - hist.member('fTsumwxy'), - hist.variances(flow=True) - ] - values = np.array(hist.values(flow=True)) - print(type(values)) - h_sum = add_2D_hists(filenames, hist.member('fName'), members, values, bins) - elif len(hist.axes) == 3: - members = [ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - hist.member('fTsumwy'), - hist.member('fTsumwy2'), - hist.member('fTsumwxy'), - hist.member('fTsumwz'), - hist.member('fTsumwz2'), - hist.member('fTsumwxz'), - hist.member('fTsumwyz'), - hist.variances(flow=True) - ] - values += np.array(hist.values(flow=True)) - h_sum = add_3D_hists(filenames, hist.member('fName'), members, values, bins) - file_out = uproot.recreate(destination) # What compression level? 
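# One answer to the "What compression level?" question above: uproot's writing
# functions accept a compression argument. ZLIB level 1 is assumed here as a
# rough match for ROOT's historical default -- worth verifying, not definitive:
import uproot
file_out = uproot.recreate("destination.root", compression=uproot.ZLIB(1))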
- file_out[h_sum.member("fName")] = h_sum - - -hadd_like("place.root", filenames=["/Users/zobil/Documents/Proteus/tests/file1.root", "/Users/zobil/Documents/Proteus/tests/file2.root"], hist_names="name") -# hadd_like("place.root", directory="/Users/zobil/Documents/Proteus/tests/") From c121ebe782c5b31a2a2aa871c7a1a0e903634561 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?= Date: Mon, 16 Oct 2023 09:47:34 +0200 Subject: [PATCH 08/22] name change --- src/{proteus => odapt}/__init__.py | 5 +++-- src/{proteus => odapt}/_version.pyi | 0 src/odapt/operations/__init__.py | 1 + src/{proteus => odapt}/operations/add_histograms.py | 0 src/{proteus => odapt}/operations/parquet_to_root.py | 0 src/{proteus => odapt}/py.typed | 0 src/proteus/operations/__init__.py | 2 -- tests/test_package.py | 4 ++-- 8 files changed, 6 insertions(+), 6 deletions(-) rename src/{proteus => odapt}/__init__.py (62%) rename src/{proteus => odapt}/_version.pyi (100%) create mode 100644 src/odapt/operations/__init__.py rename src/{proteus => odapt}/operations/add_histograms.py (100%) rename src/{proteus => odapt}/operations/parquet_to_root.py (100%) rename src/{proteus => odapt}/py.typed (100%) delete mode 100644 src/proteus/operations/__init__.py diff --git a/src/proteus/__init__.py b/src/odapt/__init__.py similarity index 62% rename from src/proteus/__init__.py rename to src/odapt/__init__.py index 9ece269..a9308c7 100644 --- a/src/proteus/__init__.py +++ b/src/odapt/__init__.py @@ -5,10 +5,11 @@ """ + from __future__ import annotations -from proteus._version import version as __version__ +from odapt._version import version as __version__ -from proteus.operations import add_histograms +from odapt.operations import add_histograms __all__ = ["add_histograms"] \ No newline at end of file diff --git a/src/proteus/_version.pyi b/src/odapt/_version.pyi similarity index 100% rename from src/proteus/_version.pyi rename to src/odapt/_version.pyi diff --git a/src/odapt/operations/__init__.py b/src/odapt/operations/__init__.py new file mode 100644 index 0000000..0cc8d38 --- /dev/null +++ b/src/odapt/operations/__init__.py @@ -0,0 +1 @@ +from odapt.operations.add_histograms import * \ No newline at end of file diff --git a/src/proteus/operations/add_histograms.py b/src/odapt/operations/add_histograms.py similarity index 100% rename from src/proteus/operations/add_histograms.py rename to src/odapt/operations/add_histograms.py diff --git a/src/proteus/operations/parquet_to_root.py b/src/odapt/operations/parquet_to_root.py similarity index 100% rename from src/proteus/operations/parquet_to_root.py rename to src/odapt/operations/parquet_to_root.py diff --git a/src/proteus/py.typed b/src/odapt/py.typed similarity index 100% rename from src/proteus/py.typed rename to src/odapt/py.typed diff --git a/src/proteus/operations/__init__.py b/src/proteus/operations/__init__.py deleted file mode 100644 index fcafb42..0000000 --- a/src/proteus/operations/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from proteus.operations.add_histograms import * -from proteus.operations.temp import * \ No newline at end of file diff --git a/tests/test_package.py b/tests/test_package.py index 11c9493..7abc07f 100644 --- a/tests/test_package.py +++ b/tests/test_package.py @@ -2,8 +2,8 @@ import importlib.metadata -import proteus as m +import odapt as m def test_version(): - assert importlib.metadata.version("proteus") == m.__version__ + assert importlib.metadata.version("odapt") == m.__version__ From a27ad884466fafd5244bc2162eea2f27bba12ffa Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?= Date: Mon, 16 Oct 2023 09:54:32 +0200 Subject: [PATCH 09/22] removing extra changes --- src/odapt/operations/__init__.py | 1 - src/odapt/operations/add_histograms.py | 245 ------------------------ src/odapt/operations/parquet_to_root.py | 20 -- tests/generate_hists_root.py | 99 ---------- tests/test_add_histograms.py | 110 ----------- 5 files changed, 475 deletions(-) delete mode 100644 src/odapt/operations/__init__.py delete mode 100644 src/odapt/operations/add_histograms.py delete mode 100644 src/odapt/operations/parquet_to_root.py delete mode 100644 tests/generate_hists_root.py delete mode 100644 tests/test_add_histograms.py diff --git a/src/odapt/operations/__init__.py b/src/odapt/operations/__init__.py deleted file mode 100644 index 0cc8d38..0000000 --- a/src/odapt/operations/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from odapt.operations.add_histograms import * \ No newline at end of file diff --git a/src/odapt/operations/add_histograms.py b/src/odapt/operations/add_histograms.py deleted file mode 100644 index 91e6044..0000000 --- a/src/odapt/operations/add_histograms.py +++ /dev/null @@ -1,245 +0,0 @@ -import uproot -import numpy as np -import argparse -import os - -def get_1d_data(hist): - return np.array([ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - ]) - -def add_1D_hists(destination, file, key, union, first, keys, skip_errors): - outfile = uproot.open(destination) - try: - hist = file[key] # Try catch? - except: - if union: - print('New key') - return keys.append(), None - elif skip_errors: - return keys, None - else: - ValueError("Histogram ", key, " missing from other files") - if first: - member_data = np.array([ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - ]) - return keys, uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), - hist.values(flow=True), *member_data, hist.variances(flow=True), hist.member("fXaxis")) - elif hist.member('fN') == outfile[key].member('fN'): - member_data = np.array([ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - ]) - h_sum = uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), outfile[key].values(flow=True) + hist.values(flow=True), - *np.add(np.array([outfile[key].member('fEntries'), outfile[key].member('fTsumw'), outfile[key].member('fTsumw2'), outfile[key].member('fTsumwx'), - outfile[key].member('fTsumwx2')]), member_data), outfile[key].variances(flow=True) + hist.variances(flow=True), hist.member("fXaxis")) - file.close() - return keys, h_sum - - -def add_2D_hists(destination, file, key, union, first, keys, skip_errors): - # bins = -1 - # for path in files: - outfile = uproot.open(destination) - # keys = {keys} - # for key in keys: - try: - hist = file[key] # Try catch? 
- except: - if union: - print('New key') - keys.append() - elif skip_errors: - return keys, None - else: - ValueError("Histogram ", key, " missing from other files") - if first: - member_data = np.array([ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - hist.member('fTsumwy'), - hist.member('fTsumwy2'), - hist.member('fTsumwxy') - ]) - return keys, uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), - np.ravel(hist.values(flow=True), order='C'), *member_data, np.ravel(hist.variances(flow=True), order='C'), hist.member("fXaxis"), hist.member("fYaxis")) - elif hist.member('fN') == outfile[key].member('fN'): - member_data = np.array([ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - hist.member('fTsumwy'), - hist.member('fTsumwy2'), - hist.member('fTsumwxy') - ]) - h_sum = uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), outfile[key].values(flow=True) + hist.values(flow=True), - *np.add(np.array([outfile[key].member('fEntries'), outfile[key].member('fTsumw'), outfile[key].member('fTsumw2'), outfile[key].member('fTsumwx'), - outfile[key].member('fTsumwx2')]), member_data), outfile[key].variances(flow=True) + hist.variances(flow=True), hist.member("fXaxis")) - file.close() - return keys, h_sum - - -def add_3D_hists(destination, file, key, union, first, keys, skip_errors): - outfile = uproot.open(destination) - try: - hist = file[key] # Try catch? - except: - if union: - print('New key') - return keys.append(), None - elif skip_errors: - return keys, None - else: - ValueError("Histogram ", key, " missing from other files") - if first: - member_data = np.array([ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - hist.member('fTsumwy'), - hist.member('fTsumwy2'), - hist.member('fTsumwxy'), - hist.member('fTsumwz'), - hist.member('fTsumwz2'), - hist.member('fTsumwxz'), - hist.member('fTsumwyz') - ]) - return uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), - np.ravel(hist.values(flow=True), order='C'), *member_data, np.ravel(hist.variances(flow=True), order='C'), hist.member("fXaxis"), hist.member("fYaxis"), hist.member("fZaxis")) - elif hist.member('fN') == outfile[key].member('fN'): - member_data = np.add(np.array([ - hist.member('fEntries'), - hist.member('fTsumw'), - hist.member('fTsumw2'), - hist.member('fTsumwx'), - hist.member('fTsumwx2'), - hist.member('fTsumwy'), - hist.member('fTsumwy2'), - hist.member('fTsumwxy'), - hist.member('fTsumwz'), - hist.member('fTsumwz2'), - hist.member('fTsumwxz'), - hist.member('fTsumwyz') - ]), np.array(hist.member('fEntries'), - outfile[key].member('fTsumw'), - outfile[key].member('fTsumw2'), - outfile[key].member('fTsumwx'), - outfile[key].member('fTsumwx2'), - outfile[key].member('fTsumwy'), - outfile[key].member('fTsumwy2'), - outfile[key].member('fTsumwxy'), - outfile[key].member('fTsumwz'), - outfile[key].member('fTsumwz2'), - outfile[key].member('fTsumwxz'), - outfile[key].member('fTsumwyz'))) - h_sum = uproot.writing.identify.to_TH1x(hist.member("fName"), hist.member("fTitle"), np.ravel(outfile[key].values(flow=True) + hist.values(flow=True), order='C'), - *member_data, np.ravel((outfile[key].variances(flow=True) + hist.variances(flow=True)), order='C'), hist.member("fXaxis"), hist.member("fYaxis"), hist.member("fZaxis")) - 
file.close() - return keys, h_sum - -def add_hists( - destination, - files, - *, - target_compression=1, - tree_reduction=False, - append=False, - force=False, - no_trees=True, - skip_errors=False, - max_opened_files=0, - union=False, # Union vs intersection - same_name_only=True - ): - - """ - Args: - destination (path-like): Name of the output file or file path. - files (Str or list of str): List of local ROOT files to read histograms from. - hist_names (None, str, or list of str): Names of histograms to be added together, must be specified if files contain different numbers of histograms. - force (bool): If True, overwrites destination file if it exists. - append (bool): If True, appends histograms to an existing file. - skip_errors (bool): If True, skips corrupt or non-existant files without exiting. - max_opened_files (int): Limits the number of files to be open at the same time. - skip_extra (bool): If True, ignores histograms that are not in all files. If False, writes all histograms to destination file. - no_extra (bool): If True, throws an error if files do not have the same histograms. - - Adds together histograms from local ROOT files of a collection of ROOT files, and writes them to one ROOT file. - """ - if os.path.isfile(destination): - if not force and not append: - raise FileExistsError - elif force and append: - raise ValueError("Cannot append to a new file. Either force or append can be true.") - if force: - file_out = uproot.recreate(destination) - - if type(files) != list: # Will this always work? - if files.endswith('.txt'): - readfile=readfile - import glob - files = sorted( - glob.glob(files + f"/**/*{'.root'}", recursive=True) - ) - - if no_trees: - with uproot.open(files[0]) as file: - # iterclassnames ? https://uproot.readthedocs.io/en/latest/uproot.reading.ReadOnlyDirectory.html - keys = file.keys(cycle=False) - print(type(keys)) - keys_axes = dict(zip(keys, (len(file[i].axes) for i in keys))) - # print(file.classnames()) - else: - with uproot.open(files[0]) as file: - #filter for both TTrees and histograms - keys = file.keys(filter_classname='[TH[1|2|3][I|S|F|D|C]|TTREE]', cycle=False) # Actually might account for subdirectories and everything? 
https://uproot.readthedocs.io/en/latest/basic.html#finding-objects-in-a-file - keys_axes = dict(zip(keys, (len(file[i].axes) for i in keys))) - - first = True - for file in files: - try: - file = uproot.open(file) - except: - Warning("File: " + {file} + " does not exist or is corrupt.") - continue - for key in keys: - if keys_axes[key] == 1: - keys, h_sum = add_1D_hists(destination, file, key, union, first, keys, skip_errors) - elif keys_axes[key] == 2: - keys, h_sum = add_2D_hists(destination, file, key, union, first, keys, skip_errors) - else: - keys, h_sum = add_3D_hists(destination, file, key, union, first, keys, skip_errors) - if h_sum != None: - file_out[key] = h_sum - first = False - file.close() - -def args(): - argparser = argparse.ArgumentParser(description="Hadd ROOT histograms with Uproot") - argparser.add_argument("destination", type=str, help="path of output file") - argparser.add_argument("input_files", type=str, nargs="+", help="list or directory (glob syntax accepted) of input files") - argparser.add_argument("-f", action="store_true",default=False, help="force overwrite of output file") - -def tree_reduction(max_opened_files): - # Root checks system max opened files - work = work - diff --git a/src/odapt/operations/parquet_to_root.py b/src/odapt/operations/parquet_to_root.py deleted file mode 100644 index 2acccc0..0000000 --- a/src/odapt/operations/parquet_to_root.py +++ /dev/null @@ -1,20 +0,0 @@ -import dask_awkward as dak -import uproot -def parquet_to_root(read_path, - write_path, - *, - columns, - storage_options, - max_gap, - max_block, - footer_sample_size, - generate_bitmasks, - highlevel, - behavior, - ): - arrays = dak.from_parquet(read_path, split_row_groups=True) - tree = uproot.recreate(write_path) - tree.mktree("tree", {arrays.partitions[0]}) #name? But root files aren't just TTrees... 
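# mktree wants branch types, not a set of data; a working variant of the
# function being deleted here writes the first partition to create the TTree
# and extends it with the rest. A sketch under that assumption -- the fixed
# tree name "tree" is kept from the original, and each partition is computed:
import dask_awkward as dak
import uproot

def parquet_to_root(read_path, write_path):
    arrays = dak.from_parquet(read_path, split_row_groups=True)
    out = uproot.recreate(write_path)
    first = arrays.partitions[0].compute()
    out["tree"] = {field: first[field] for field in first.fields}
    for i in range(1, arrays.npartitions):
        part = arrays.partitions[i].compute()
        out["tree"].extend({field: part[field] for field in part.fields})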
- for i in range(1,arrays.npartitions): - tree["tree"].extend(arrays.partitions[i]) - \ No newline at end of file diff --git a/tests/generate_hists_root.py b/tests/generate_hists_root.py deleted file mode 100644 index a2499e8..0000000 --- a/tests/generate_hists_root.py +++ /dev/null @@ -1,99 +0,0 @@ -import ROOT -import uproot - -# h1 = ROOT.TH1I("name", "title", 10, -4, 4) -# h1.FillRandom("gaus") -# h2 = ROOT.TH1I("name", "title", 10, -4, 4) -# h2.FillRandom("gaus") - -def gen_gause_hists_uproot(): - file_out = uproot.recreate("file1.root") - h1 = uproot.from_pyroot(h1) - h_1 = uproot.writing.identify.to_TH1x(h1.member("fName"), - h1.member("fTitle"), - h1.values(flow=True), - h1.member("fEntries"), - h1.member("fTsumw"), - h1.member("fTsumw2"), - h1.member("fTsumwx"), - h1.member("fTsumwx2"), - h1.variances(flow=True), - h1.member("fXaxis"), - ) - print(h_1) - file_out[h_1.member("fName")] = h_1 - - file_out = uproot.recreate("file2.root") - h2 = uproot.from_pyroot(h2) - h_2 = uproot.writing.identify.to_TH1x(h2.member("fName"), - h2.member("fTitle"), - h2.values(flow=True), - h2.member("fEntries"), - h2.member("fTsumw"), - h2.member("fTsumw2"), - h2.member("fTsumwx"), - h2.member("fTsumwx2"), - h2.variances(flow=True), - h2.member("fXaxis"), - ) - - file_out[h_2.member("fName")] = h_2 - -def gen_gaus_hists_pyroot(names, file_names): - # Will create histograms with same names and bins for files in file_names - for file in file_names: - for name in names: - h = ROOT.TH1I(name, name, 10, -4, 4) - h.FillRandom("gaus") - h.Sumw2() - h.SetDirectory(0) - outHistFile = ROOT.TFile.Open(file, "RECREATE") - outHistFile.cd() - h.Write() - outHistFile.Close() - -def gen_gaus_hists_pyroot(): - h1 = ROOT.TH1I("name", "title", 10, -4, 4) - h1.FillRandom("gaus") - h1.Sumw2() - h1.SetDirectory(0) - outHistFile = ROOT.TFile.Open("file1.root", "RECREATE") - outHistFile.cd() - h1.Write() - outHistFile.Close() - - h2 = ROOT.TH1I("name", "title", 10, -4, 4) - h2.FillRandom("gaus") - h2.Sumw2() - h2.SetDirectory(0) - outHistFile = ROOT.TFile.Open("file2.root", "RECREATE") - outHistFile.cd() - h2.Write() - outHistFile.Close() - -def gen_2dim_hists_pyroot(num_hists, num_files, names): - import numpy as np - xedges = [0, 1, 3, 5] - yedges = [0, 2, 3, 4, 6] - x = np.random.normal(2, 1, 100) - y = np.random.normal(1, 1, 100) - H, xedges, yedges = np.histogram2d(x, y, bins=(xedges, yedges)) - - h1 = ROOT.TH2I("name", "title", len(xedges), 0.0, 5.0, len(yedges), 0.0, 6.0) - h1.Sumw2() - h1.Fill(0,0) - h1.SetDirectory(0) - outHistFile = ROOT.TFile.Open("file2dim1.root", "RECREATE") - outHistFile.cd() - h1.Write() - outHistFile.Close() - - - h2 = ROOT.TH2I("name", "title", len(xedges), 0.0, 5.0, len(yedges), 0.0, 6.0) - h2.Sumw2() - h2.Fill(0,0) - h2.SetDirectory(0) - outHistFile = ROOT.TFile.Open("file2dim2.root", "RECREATE") - outHistFile.cd() - h2.Write() - outHistFile.Close() diff --git a/tests/test_add_histograms.py b/tests/test_add_histograms.py deleted file mode 100644 index 84352c8..0000000 --- a/tests/test_add_histograms.py +++ /dev/null @@ -1,110 +0,0 @@ -import os -import uproot -import sys -sys.path.append("/Users/zobil/Documents/Proteus/src/") -import proteus -import ROOT -import numpy as np - -def gen_1d_root(file_paths): - h1 = ROOT.TH1I("name", "title", 5, -4, 4) - h1.FillRandom("gaus") - h1.Sumw2() - h1.SetDirectory(0) - outHistFile = ROOT.TFile.Open(file_paths[0], "RECREATE") - outHistFile.cd() - h1.Write() - outHistFile.Close() - h1 = uproot.from_pyroot(h1) - - h2 = ROOT.TH1I("name", "title", 5, -4, 
diff --git a/tests/test_add_histograms.py b/tests/test_add_histograms.py
deleted file mode 100644
index 84352c8..0000000
--- a/tests/test_add_histograms.py
+++ /dev/null
@@ -1,110 +0,0 @@
-import os
-import uproot
-import sys
-sys.path.append("/Users/zobil/Documents/Proteus/src/")
-import proteus
-import ROOT
-import numpy as np
-
-def gen_1d_root(file_paths):
-    h1 = ROOT.TH1I("name", "title", 5, -4, 4)
-    h1.FillRandom("gaus")
-    h1.Sumw2()
-    h1.SetDirectory(0)
-    outHistFile = ROOT.TFile.Open(file_paths[0], "RECREATE")
-    outHistFile.cd()
-    h1.Write()
-    outHistFile.Close()
-    h1 = uproot.from_pyroot(h1)
-
-    h2 = ROOT.TH1I("name", "title", 5, -4, 4)
-    h2.FillRandom("gaus")
-    h2.Sumw2()
-    h2.SetDirectory(0)
-    outHistFile = ROOT.TFile.Open(file_paths[1], "RECREATE")
-    outHistFile.cd()
-    h2.Write()
-    outHistFile.Close()
-    h2 = uproot.from_pyroot(h2)
-
-    h3 = ROOT.TH1I("name", "title", 5, -4, 4)
-    h3.FillRandom("gaus")
-    h3.Sumw2()
-    h3.SetDirectory(0)
-    outHistFile = ROOT.TFile.Open(file_paths[2], "RECREATE")
-    outHistFile.cd()
-    h3.Write()
-    outHistFile.Close()
-    h3 = uproot.from_pyroot(h3)
-    return h1, h2, h3
-
-def test_simple(tmp_path, file_paths):
-    h1, h2, h3 = gen_1d_root(file_paths)
-
-    destination = os.path.join(tmp_path, "destination.root")
-    proteus.operations.add_hists(destination, file_paths, hist_names="name", tree_reduction=False)
-
-    with uproot.open(destination) as file:
-        assert file["name"].member("fN") == h1.member("fN")
-        assert file["name"].member("fTsumw") == h1.member("fTsumw") + h2.member("fTsumw") + h3.member("fTsumw")
-        assert np.equal(file["name"].values(flow=True), np.array(h1.values(flow=True) + h2.values(flow=True))).all
-
-def test_tree_reduction(tmp_path, file_paths):
-    h1, h2, h3 = gen_1d_root(file_paths)
-
-    destination = os.path.join(tmp_path, "destination.root")
-    proteus.operations.add_hists(destination, file_paths, hist_names="name", tree_reduction=True)
-
-    with uproot.open(destination) as file:
-        assert file["name"].member("fN") == h1.member("fN")
-        assert file["name"].member("fTsumw") == h1.member("fTsumw") + h2.member("fTsumw") + h3.member("fTsumw")
-        assert np.equal(file["name"].values(flow=True), np.array(h1.values(flow=True) + h2.values(flow=True))).all
-
-def test_3_glob(tmp_path, file_paths):
-    h1, h2, h3 = gen_1d_root(file_paths)
-
-    # destination = os.path.join(tmp_path, "destination.root")
-    proteus.operations.add_hists(os.path.join(tmp_path, "place.root"), "tests/directory")
-
-    with uproot.open("tests/place.root") as file:
-        assert file["name"].member("fN") == h1.member("fN")
-        assert file["name"].member("fTsumw") == h1.member("fTsumw") + h2.member("fTsumw") + h3.member("fTsumw")
-        assert np.equal(file["name"].values(flow=True), np.array(h1.values(flow=True) + h2.values(flow=True) + h3.values(flow=True))).all
-
-def test_2dim(tmp_path):
-    xedges = [0, 1, 3, 5]
-    yedges = [0, 2, 3, 4, 6]
-    x = np.random.normal(2, 1, 100)
-    y = np.random.normal(1, 1, 100)
-    H, xedges, yedges = np.histogram2d(x, y, bins=(xedges, yedges))
-
-    h1 = ROOT.TH2I("name", "title", len(xedges), 0.0, 5.0, len(yedges), 0.0, 6.0)
-    h1.Sumw2()
-    h1.Fill(0,0)
-    h1.SetDirectory(0)
-    outHistFile = ROOT.TFile.Open("tests/file2dim1.root", "RECREATE")
-    outHistFile.cd()
-    h1.Write()
-    outHistFile.Close()
-
-    h2 = ROOT.TH2I("name", "title", len(xedges), 0.0, 5.0, len(yedges), 0.0, 6.0)
-    h2.Sumw2()
-    h2.Fill(0,0)
-    h2.SetDirectory(0)
-    outHistFile = ROOT.TFile.Open("tests/file2dim2.root", "RECREATE")
-    outHistFile.cd()
-    h2.Write()
-    outHistFile.Close()
-
-    proteus.operations.add_hists("tests/place2.root", ["file2dim1.root", "file2dim2.root"], hist_names="name")
-
-    with uproot.open("tests/place2.root") as file:
-        assert file["name"].member("fN") == h1.member("fN")
-        assert file["name"].member("fTsumw") == h1.member("fTsumw") + h2.member("fTsumw")
-        print(file["name"].values(flow=True))
-        assert np.equal(file["name"].values(flow=True), np.array(h1.values(flow=True) + h2.values(flow=True))).all
-
-# def test_partial_tree_reduction():
-
-# test_3_glob("",["tests/directory/file1.root","tests/directory/file2.root","tests/directory/file3.root"])
-test_simple("",["tests/directory/file1.root","tests/directory/file2.root","tests/directory/file3.root"])
\ No newline at end of file
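Note: if these tests are ever resurrected, the array assertions end in .all rather than .all(), so they never compare anything — a bound method is always truthy, and the asserts always pass (test_simple also leaves h3 out of the expected sum). A corrected form of that assertion might look like this sketch, with file and h1–h3 as in the deleted test:

    import numpy as np

    # np.array_equal does the elementwise comparison and the reduction in
    # one call, so there is no .all() to forget.
    assert np.array_equal(
        file["name"].values(flow=True),
        h1.values(flow=True) + h2.values(flow=True) + h3.values(flow=True),
    )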
From 16f2ffec72a8feafd9a0644681cde57263bf1b74 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?=
Date: Mon, 16 Oct 2023 10:00:16 +0200
Subject: [PATCH 10/22] format change

---
 src/odapt/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/odapt/__init__.py b/src/odapt/__init__.py
index a9308c7..b012cca 100644
--- a/src/odapt/__init__.py
+++ b/src/odapt/__init__.py
@@ -12,4 +12,4 @@
 
 from odapt.operations import add_histograms
 
-__all__ = ["add_histograms"]
\ No newline at end of file
+__all__ = ["add_histograms"]

From cc760b3cdcd1a13cd4dbd280202cd30dc3415749 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?=
Date: Mon, 16 Oct 2023 10:06:09 +0200
Subject: [PATCH 11/22] fix version file

---
 src/odapt/__init__.py | 2 +-
 src/odapt/version.py  | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)
 create mode 100644 src/odapt/version.py

diff --git a/src/odapt/__init__.py b/src/odapt/__init__.py
index b012cca..aeebff4 100644
--- a/src/odapt/__init__.py
+++ b/src/odapt/__init__.py
@@ -8,7 +8,7 @@
 
 from __future__ import annotations
 
-from odapt._version import version as __version__
+from odapt.version import __version__
 
 from odapt.operations import add_histograms
 
diff --git a/src/odapt/version.py b/src/odapt/version.py
new file mode 100644
index 0000000..063d69c
--- /dev/null
+++ b/src/odapt/version.py
@@ -0,0 +1,5 @@
+import re
+
+__version__ = "1.0"
+version = __version__
+version_info = tuple(re.split(r"[-\.]", __version__))
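Note: re.split returns strings, so the version_info defined above is a tuple of strings — unlike the sys.version_info-style tuples of ints that callers usually compare against. A short sketch of the difference, assuming the version.py just added:

    import re

    __version__ = "1.0"
    version_info = tuple(re.split(r"[-\.]", __version__))
    assert version_info == ("1", "0")   # strings, not ints

    # A comparison like version_info >= (1, 0) raises TypeError; coercing
    # to int works for purely numeric releases:
    numeric_info = tuple(int(part) for part in __version__.split("."))
    assert numeric_info >= (1, 0)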
From c561118ec043a9edf0bff71f26f5536f55105aeb Mon Sep 17 00:00:00 2001
From: zbilodea <70441641+zbilodea@users.noreply.github.com>
Date: Mon, 16 Oct 2023 04:13:53 -0400
Subject: [PATCH 12/22] Delete src/odapt/_version.pyi

---
 src/odapt/_version.pyi | 4 ----
 1 file changed, 4 deletions(-)
 delete mode 100644 src/odapt/_version.pyi

diff --git a/src/odapt/_version.pyi b/src/odapt/_version.pyi
deleted file mode 100644
index 91744f9..0000000
--- a/src/odapt/_version.pyi
+++ /dev/null
@@ -1,4 +0,0 @@
-from __future__ import annotations
-
-version: str
-version_tuple: tuple[int, int, int] | tuple[int, int, int, str, str]

From 7498e782ed6bcf4ffca9b37ff54bf6248dfb1ab8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?=
Date: Mon, 16 Oct 2023 10:26:02 +0200
Subject: [PATCH 13/22] format fix

---
 src/odapt/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/odapt/__init__.py b/src/odapt/__init__.py
index aeebff4..2a6630d 100644
--- a/src/odapt/__init__.py
+++ b/src/odapt/__init__.py
@@ -8,8 +8,8 @@
 
 from __future__ import annotations
 
-from odapt.version import __version__
+from Odapt.version import __version__
 
-from odapt.operations import add_histograms
+from Odapt.operations import add_histograms
 
 __all__ = ["add_histograms"]

From 57a281d77cac21c444270532619cfe6e7281fb7a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?=
Date: Mon, 16 Oct 2023 10:27:15 +0200
Subject: [PATCH 14/22] fix attempt

---
 src/odapt/__init__.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/odapt/__init__.py b/src/odapt/__init__.py
index 2a6630d..12d2331 100644
--- a/src/odapt/__init__.py
+++ b/src/odapt/__init__.py
@@ -8,8 +8,6 @@
 
 from __future__ import annotations
 
-from Odapt.version import __version__
+from version import __version__
 
-from Odapt.operations import add_histograms
-
-__all__ = ["add_histograms"]
+__all__ = ["__version__"]

From 24650708ec5ab74eca9fa320fa4d58000c7584c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?=
Date: Mon, 16 Oct 2023 10:29:23 +0200
Subject: [PATCH 15/22] fix attempt

---
 src/odapt/__init__.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/odapt/__init__.py b/src/odapt/__init__.py
index 12d2331..27fbdb3 100644
--- a/src/odapt/__init__.py
+++ b/src/odapt/__init__.py
@@ -1,13 +1,9 @@
 """
 Copyright (c) 2023 Zoë Bilodeau. All rights reserved.
 
-Proteus: File conversion package.
+Odapt: File conversion package.
 """
-
-
-from __future__ import annotations
-
 from version import __version__
 
 __all__ = ["__version__"]

From 3b37e6722628e1d71cfa5421fdd233f2303415c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?=
Date: Mon, 16 Oct 2023 10:34:13 +0200
Subject: [PATCH 16/22] version fix

---
 src/odapt/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/odapt/__init__.py b/src/odapt/__init__.py
index 27fbdb3..27343ee 100644
--- a/src/odapt/__init__.py
+++ b/src/odapt/__init__.py
@@ -4,6 +4,6 @@
 Odapt: File conversion package.
 """
 
-from version import __version__
+from Odapt.version import __version__
 
 __all__ = ["__version__"]

From 9303dd98ea359ccabef73eb847774cf42e1a2deb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?=
Date: Mon, 16 Oct 2023 10:37:24 +0200
Subject: [PATCH 17/22] finally fixed

---
 src/odapt/__init__.py | 1 +
 src/odapt/version.py  | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/odapt/__init__.py b/src/odapt/__init__.py
index 27343ee..a5722ef 100644
--- a/src/odapt/__init__.py
+++ b/src/odapt/__init__.py
@@ -3,6 +3,7 @@
 
 Odapt: File conversion package.
 """
+from __future__ import annotations
 
 from Odapt.version import __version__
 
diff --git a/src/odapt/version.py b/src/odapt/version.py
index 063d69c..cf3683e 100644
--- a/src/odapt/version.py
+++ b/src/odapt/version.py
@@ -1,5 +1,6 @@
+from __future__ import annotations
+
 import re
 
 __version__ = "1.0"
-version = __version__
 version_info = tuple(re.split(r"[-\.]", __version__))

From 136295c428ff7db0b8660e24d0d00555e16722a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?=
Date: Mon, 16 Oct 2023 10:45:58 +0200
Subject: [PATCH 18/22] version not recognized

---
 src/odapt/_version.pyi | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 src/odapt/_version.pyi

diff --git a/src/odapt/_version.pyi b/src/odapt/_version.pyi
new file mode 100644
index 0000000..f5d880b
--- /dev/null
+++ b/src/odapt/_version.pyi
@@ -0,0 +1,4 @@
+from __future__ import annotations
+
+version: "1.0"
+version_tuple: [1,0,0]
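Note: the stub added in PATCH 18 cannot fix an import error. A .pyi file is read only by static type checkers and is never imported at runtime, and version: "1.0" / version_tuple: [1,0,0] put values where type annotations belong, which checkers reject. PATCH 20 below repairs the annotations; the runtime module has to come from the build backend instead, which PATCH 21 wires up by letting hatch-vcs generate src/odapt/_version.py at build time. That generated file is roughly of this shape (a sketch based on the setuptools-scm template, not the exact generated text):

    # _version.py is written by hatch-vcs/setuptools-scm at build time;
    # it should not be edited or tracked in version control.
    __version__ = version = "1.0.0"
    __version_tuple__ = version_tuple = (1, 0, 0)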
From 02f24ccaf94c2cf9f1908dcf3d7e268fd8aff2e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?=
Date: Mon, 16 Oct 2023 10:49:56 +0200
Subject: [PATCH 19/22] version still not recognized

---
 src/odapt/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/odapt/__init__.py b/src/odapt/__init__.py
index a5722ef..5d74e98 100644
--- a/src/odapt/__init__.py
+++ b/src/odapt/__init__.py
@@ -5,6 +5,6 @@
 """
 from __future__ import annotations
 
-from Odapt.version import __version__
+from Odapt._version import __version__
 
 __all__ = ["__version__"]

From 8736eb59add8900645a374c132af4faf3f7d09ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?=
Date: Mon, 16 Oct 2023 10:52:03 +0200
Subject: [PATCH 20/22] version

---
 src/odapt/__init__.py  | 2 +-
 src/odapt/_version.pyi | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/odapt/__init__.py b/src/odapt/__init__.py
index 5d74e98..9ae33e3 100644
--- a/src/odapt/__init__.py
+++ b/src/odapt/__init__.py
@@ -5,6 +5,6 @@
 """
 from __future__ import annotations
 
-from Odapt._version import __version__
+from Odapt._version import version as __version__
 
 __all__ = ["__version__"]

diff --git a/src/odapt/_version.pyi b/src/odapt/_version.pyi
index f5d880b..91744f9 100644
--- a/src/odapt/_version.pyi
+++ b/src/odapt/_version.pyi
@@ -1,4 +1,4 @@
 from __future__ import annotations
 
-version: "1.0"
-version_tuple: [1,0,0]
+version: str
+version_tuple: tuple[int, int, int] | tuple[int, int, int, str, str]

From ec54d78482c7cd062e238098c7b91323dd94960a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?=
Date: Mon, 16 Oct 2023 11:12:13 +0200
Subject: [PATCH 21/22] fixing version

---
 pyproject.toml        | 18 +++++++++---------
 src/odapt/__init__.py |  2 +-
 src/odapt/version.py  |  6 ------
 3 files changed, 10 insertions(+), 16 deletions(-)
 delete mode 100644 src/odapt/version.py

diff --git a/pyproject.toml b/pyproject.toml
index 1f8954c..5edf88b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
-name = "proteus"
+name = "odapt"
 authors = [
   { name = "Zoë Bilodeau", email = "zobilodeau@gmail.com" },
 ]
@@ -49,15 +49,15 @@ docs = [
 ]
 
 [project.urls]
-Homepage = "https://github.com/zbilodea/Proteus"
-"Bug Tracker" = "https://github.com/zbilodea/Proteus/issues"
-Discussions = "https://github.com/zbilodea/Proteus/discussions"
-Changelog = "https://github.com/zbilodea/Proteus/releases"
+Homepage = "https://github.com/zbilodea/Odapt"
+"Bug Tracker" = "https://github.com/zbilodea/Odapt/issues"
+Discussions = "https://github.com/zbilodea/Odapt/discussions"
+Changelog = "https://github.com/zbilodea/Odapt/releases"
 
 [tool.hatch]
 version.source = "vcs"
-build.hooks.vcs.version-file = "src/proteus/_version.py"
+build.hooks.vcs.version-file = "src/odapt/_version.py"
 envs.default.dependencies = [
   "pytest",
   "pytest-cov",
@@ -78,7 +78,7 @@ testpaths = [
 ]
 
 [tool.coverage]
-run.source = ["proteus"]
+run.source = ["odapt"]
 port.exclude_lines = [
   'pragma: no cover',
   '\.\.\.',
@@ -97,7 +97,7 @@ disallow_untyped_defs = false
 disallow_incomplete_defs = false
 
 [[tool.mypy.overrides]]
-module = "proteus.*"
+module = "odapt.*"
 disallow_untyped_defs = true
 disallow_incomplete_defs = true
@@ -141,7 +141,7 @@ exclude = []
 flake8-unused-arguments.ignore-variadic-names = true
 isort.required-imports = ["from __future__ import annotations"]
 # Uncomment if using a _compat.typing backport
-# typing-modules = ["proteus._compat.typing"]
+# typing-modules = ["odapt._compat.typing"]
 
 [tool.ruff.per-file-ignores]
 "tests/**" = ["T20"]

diff --git a/src/odapt/__init__.py b/src/odapt/__init__.py
index 9ae33e3..872d7dc 100644
--- a/src/odapt/__init__.py
+++ b/src/odapt/__init__.py
@@ -5,6 +5,6 @@
 """
 from __future__ import annotations
 
-from Odapt._version import version as __version__
+from odapt._version import version as __version__
 
 __all__ = ["__version__"]

diff --git a/src/odapt/version.py b/src/odapt/version.py
deleted file mode 100644
index cf3683e..0000000
--- a/src/odapt/version.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from __future__ import annotations
-
-import re
-
-__version__ = "1.0"
-version_info = tuple(re.split(r"[-\.]", __version__))
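Note: with PATCH 21 the pieces finally line up — the hatch-vcs hook in pyproject.toml writes src/odapt/_version.py at build time, and __init__.py imports version from it under the correct lowercase package name. A quick smoke test after an editable install (pip install -e .); the exact version string depends on the git state and is only illustrative:

    import odapt

    # The version now comes from the generated odapt/_version.py (a
    # setuptools-scm string derived from git metadata), not from the
    # hand-written "1.0" in the deleted version.py.
    print(odapt.__version__)
    assert odapt.__all__ == ["__version__"]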
From 9d52b9273e69d8437889d54ca1910324feb03b29 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?=
Date: Mon, 16 Oct 2023 11:48:20 +0200
Subject: [PATCH 22/22] more name changes

---
 .github/CONTRIBUTING.md | 2 +-
 docs/conf.py            | 4 ++--
 docs/index.md           | 2 +-
 noxfile.py              | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index ab3900f..a99efb0 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -71,7 +71,7 @@ pytest
 Use pytest-cov to generate coverage reports:
 
 ```bash
-pytest --cov=Proteus
+pytest --cov=Odapt
 ```
 
 # Building docs

diff --git a/docs/conf.py b/docs/conf.py
index 1bb2c7d..6316841 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -2,10 +2,10 @@
 
 import importlib.metadata
 
-project = "Proteus"
+project = "Odapt"
 copyright = "2023, Zoë Bilodeau"
 author = "Zoë Bilodeau"
-version = release = importlib.metadata.version("proteus")
+version = release = importlib.metadata.version("odapt")
 
 extensions = [
     "myst_parser",

diff --git a/docs/index.md b/docs/index.md
index 4b55379..f926b6f 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,4 +1,4 @@
-# Proteus
+# Odapt
 
 ```{toctree}
 :maxdepth: 2

diff --git a/noxfile.py b/noxfile.py
index 90a3a7e..5632317 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -30,7 +30,7 @@ def pylint(session: nox.Session) -> None:
     # This needs to be installed into the package environment, and is slower
     # than a pre-commit check
     session.install(".", "pylint")
-    session.run("pylint", "proteus", *session.posargs)
+    session.run("pylint", "odapt", *session.posargs)
 
 
 @nox.session
@@ -99,7 +99,7 @@ def build_api_docs(session: nox.Session) -> None:
         "--module-first",
         "--no-toc",
         "--force",
-        "../src/proteus",
+        "../src/odapt",
     )
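Note: docs/conf.py now takes the release string from importlib.metadata, so the docs build only sees the right version once the renamed distribution is installed; a lookup under the old name would raise PackageNotFoundError. One loose end from the rename sweep: CONTRIBUTING.md now advertises pytest --cov=Odapt while pyproject.toml sets run.source = ["odapt"], and since --cov takes the importable (lowercase) package name, the capitalized form likely reports empty coverage on case-sensitive systems — pytest --cov=odapt would match. A sanity check for the metadata half, assuming odapt is installed:

    import importlib.metadata

    # The argument must match [project] name = "odapt" in pyproject.toml;
    # the old name raises importlib.metadata.PackageNotFoundError.
    print(importlib.metadata.version("odapt"))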