Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add known forth for ATLAS #1282

Merged
merged 14 commits into from
Nov 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions src/uproot/interpretation/known_forth/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/uproot5/blob/main/LICENSE

"""
This module provides known forth code and awkward forms for types where it is known a priori.

See :doc:`uproot.interpretation.known_forth.known_forth_of` for the function
that provides the lookup of known forth codes and :doc:`uproot.interpretation.known_forth.atlas.VectorVectorElementLink` for an
implementation used in ATLAS (D)AODs.
"""
from __future__ import annotations

import uproot
from uproot.interpretation.known_forth.atlas import VectorVectorElementLink

# Lookup table from C++ typename to the class that provides the known forth
# code and awkward form for it. Every typename listed here shares the same
# implementation, so the table is built from the keys alone.
KNOWN_FORTH_DICT = dict.fromkeys(
    (
        "std::vector<std::vector<ElementLink<DataVector<xAOD::CaloCluster_v1>>>>",
        "std::vector<std::vector<ElementLink<DataVector<xAOD::IParticle>>>>",
        "std::vector<std::vector<ElementLink<DataVector<xAOD::MuonSegment_v1>>>>",
        "std::vector<std::vector<ElementLink<DataVector<xAOD::NeutralParticle_v1>>>>",
        "std::vector<std::vector<ElementLink<DataVector<xAOD::TauTrack_v1>>>>",
        "std::vector<std::vector<ElementLink<DataVector<xAOD::TrackParticle_v1>>>>",
        "std::vector<std::vector<ElementLink<DataVector<xAOD::TruthParticle_v1>>>>",
        "std::vector<std::vector<ElementLink<DataVector<xAOD::Vertex_v1>>>>",
    ),
    VectorVectorElementLink,
)


def known_forth_of(model):
    """
    Args:
        model: The :doc:`uproot.model.Model` to look up known forth for

    Returns an object with attributes `forth_code` and `awkward_form` if a known
    special case exists, else None

    The typename used for the lookup is taken from ``model.typename``, falling
    back to ``model.classname`` and finally to the classname decoded from
    ``model.__name__``.
    """
    try:
        typename = model.typename
    except AttributeError:
        try:
            typename = model.classname
        except AttributeError:
            # classname_decode returns a (classname, version) pair; only the
            # classname string can match a key of KNOWN_FORTH_DICT.
            typename = uproot.model.classname_decode(model.__name__)[0]

    known_forth = KNOWN_FORTH_DICT.get(typename)
    if known_forth is None:
        return None

    return known_forth(typename)
99 changes: 99 additions & 0 deletions src/uproot/interpretation/known_forth/atlas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/uproot5/blob/main/LICENSE

"""
This module defines ATLAS specific known forth code
"""

from __future__ import annotations

import re


class VectorVectorElementLink:
    """
    Known forth and awkward form for ``std::vector<std::vector<ElementLink<T>>>`` types in ATLAS (D)AODs

    The forth code was adjusted from what was provided in
    ``branch._complete_forth_code`` after running ``.array()`` once.

    The binary data of one ``vector<vector<ElementLink<T>>>`` looks as follows:

    * 6 bytes header for the outer vector
    * 4 bytes big endian uint for the size of the outer vector (node1)
    * for each outer vector element:
        * 4 bytes big endian uint for the size of the inner vector (node2)
        * for each inner vector element:
            * 20 bytes header for the ElementLink object
            * 4 bytes big endian uint for the ``m_persKey`` member (node3)
            * 4 bytes big endian uint for the ``m_persIndex`` member (node4)
    """

    forth_code = """
input stream
input byteoffsets
input bytestops
output node1-offsets int64
output node2-offsets int64
output node3-data uint32
output node4-data uint32

0 node1-offsets <- stack
0 node2-offsets <- stack

0 do
byteoffsets I-> stack
stream seek
6 stream skip
stream !I-> stack
dup node1-offsets +<- stack
0 do
stream !I-> stack
dup node2-offsets +<- stack
0 do
20 stream skip
stream !I-> node3-data
stream !I-> node4-data
loop
loop
loop
"""

    def __init__(self, typename):
        """
        Args:
            typename (str): The full C++ typename this instance describes

        Stores ``typename`` and derives ``inner_typename`` from it by removing
        the two enclosing ``std::vector<...>`` layers. A typename that does
        not contain that pattern is kept unchanged.
        """
        self.typename = typename
        two_vectors = re.compile("std::vector<std::vector<(.*)>>")
        self.inner_typename = two_vectors.sub(r"\1", typename)

    @property
    def awkward_form(self):
        """
        The awkward form as a nested dict: a list (node1) of lists (node2) of
        records with the two ``uint32`` fields ``m_persKey`` (node3) and
        ``m_persIndex`` (node4). A new dict is built on every access.
        """

        def uint32_leaf(form_key):
            # Leaf form for one of the two ElementLink members
            return {
                "class": "NumpyArray",
                "primitive": "uint32",
                "inner_shape": [],
                "parameters": {},
                "form_key": form_key,
            }

        element_link = {
            "class": "RecordArray",
            "fields": ["m_persKey", "m_persIndex"],
            "contents": [uint32_leaf("node3"), uint32_leaf("node4")],
            "parameters": {"__record__": f"{self.inner_typename}"},
        }
        inner_vector = {
            "class": "ListOffsetArray",
            "offsets": "i64",
            "form_key": "node2",
            "content": element_link,
        }
        return {
            "class": "ListOffsetArray",
            "offsets": "i64",
            "form_key": "node1",
            "content": inner_vector,
        }
17 changes: 15 additions & 2 deletions src/uproot/interpretation/objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

import uproot
import uproot._awkwardforth
from uproot.interpretation.known_forth import known_forth_of


class AsObjects(uproot.interpretation.Interpretation):
Expand All @@ -45,14 +46,22 @@ class AsObjects(uproot.interpretation.Interpretation):
:ref:`uproot.interpretation.objects.AsObjects.simplify` attempts to
replace this interpretation with a faster-to-read equivalent, but not all
data types can be simplified.

:doc:`uproot.interpretation.known_forth` defines forth code and forms for
special cases that will be picked up here as well
"""

def __init__(self, model, branch=None):
    """
    Args:
        model: The model to interpret; also used to look up known forth
            code via ``known_forth_of``
        branch: Stored as ``self._branch``; defaults to None

    If ``known_forth_of`` returns a known special case for ``model``, its
    forth code and awkward form are stored up front; otherwise both start
    out as None.
    """
    self._model = model
    self._branch = branch
    self._forth = True
    known_forth = known_forth_of(self._model)
    if known_forth is not None:
        self._complete_forth_code = known_forth.forth_code
        self._form = known_forth.awkward_form
    else:
        self._complete_forth_code = None
        self._form = None
    self._forth_lock = threading.Lock()

@property
Expand Down Expand Up @@ -122,6 +131,10 @@ def awkward_form(
tobject_header=False,
breadcrumbs=(),
):
if self._form is not None:
awkward = uproot.extras.awkward()
return awkward.forms.from_dict(self._form)

context = self._make_context(
context, index_format, header, tobject_header, breadcrumbs
)
Expand Down
48 changes: 48 additions & 0 deletions tests/test_1282_add_known_forth_for_atlas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/env python3

import awkward
import pytest
import skhep_testdata
import uproot

# Branch names looked up in the "CollectionTree" of uproot-issue-123a.root by
# the tests below; each one is expected to get known forth code and a form.
VECTOR_VECTOR_ELEMENTLINK_BRANCHES = [
    "AnalysisHLT_e12_lhloose_nod0_2mu10AuxDyn.TrigMatchedObjects",
    "AnalysisElectronsAuxDyn.caloClusterLinks",
    "AnalysisPhotonsAuxDyn.vertexLinks",
    "TruthMuonsAuxDyn.childLinks",
    "AnalysisElectronsAuxDyn.trackParticleLinks",
    "PrimaryVerticesAuxDyn.neutralParticleLinks",
    "AnalysisTauJetsAuxDyn.tauTrackLinks",
]


@pytest.mark.parametrize("key", VECTOR_VECTOR_ELEMENTLINK_BRANCHES)
def test_pickup_vector_vector_elementlink(key):
    """
    The interpretation of each listed branch must already carry forth code
    and a form before any data has been read.
    """
    root_file = skhep_testdata.data_path("uproot-issue-123a.root")
    with uproot.open({root_file: "CollectionTree"}) as tree:
        branch = tree[key]
        assert branch.interpretation._complete_forth_code is not None
        assert branch.interpretation._form is not None


def test_consistent_library_np_vector_vector_elementlink():
    """
    Reading the listed branches with ``library="np"`` and converting the
    result to awkward must agree, field by field, with reading them with the
    default library.
    """
    # The file is opened once per library, as two separate sessions.
    with uproot.open(
        {skhep_testdata.data_path("uproot-issue-123a.root"): "CollectionTree"}
    ) as tree:
        arrays_np = {
            key: tree[key].array(library="np")
            for key in VECTOR_VECTOR_ELEMENTLINK_BRANCHES
        }

    with uproot.open(
        {skhep_testdata.data_path("uproot-issue-123a.root"): "CollectionTree"}
    ) as tree:
        arrays_ak = {
            key: tree[key].array() for key in VECTOR_VECTOR_ELEMENTLINK_BRANCHES
        }

    for key, objects_np in arrays_np.items():
        array_ak = arrays_ak[key]
        form_dict = array_ak.layout.form.to_dict()
        array_np = uproot.interpretation.library._object_to_awkward_array(
            awkward, form_dict, objects_np
        )
        for field in array_ak.fields:
            assert awkward.all(array_np[field] == array_ak[field])
Loading