Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Load additional schemes for segments in audformat.Database.get() from filewise table #460

Open
audeerington opened this issue Sep 27, 2024 · 4 comments · May be fixed by #461
Open

Load additional schemes for segments in audformat.Database.get() from filewise table #460

audeerington opened this issue Sep 27, 2024 · 4 comments · May be fixed by #461
Labels
bug Something isn't working

Comments

@audeerington
Copy link

For a database db with a segment-wise table with a column col_1 and a filewise table with a column meta_1,
db.get("col_1", additional_schemes="meta_1") doesn't load the column data from meta_1 from the filewise table:

# Minimal reproduction: a database with one filewise table ("files")
# and one segmented table ("segments").
import audformat
import audformat.testing

db = audformat.testing.create_db(minimal=True)
# String schemes named like the columns added to the tables below.
db.schemes["meta_1"] = audformat.Scheme(dtype="str")
db.schemes["col_1"] = audformat.Scheme(dtype="str")
# Filewise table holding the column "meta_1".
audformat.testing.add_table(
    db,
    table_id="files",
    index_type=audformat.define.IndexType.FILEWISE,
    columns=["meta_1"],
)
# Segmented table holding the column "col_1".
audformat.testing.add_table(
    db,
    table_id="segments",
    index_type=audformat.define.IndexType.SEGMENTED,
    columns=["col_1"],
)
# Request "col_1" as the main scheme and "meta_1" as an additional scheme;
# as reported, the "meta_1" values of the filewise table are not loaded.
db.get(
    "col_1",
    additional_schemes="meta_1",
)
@hagenw
Copy link
Member

hagenw commented Sep 27, 2024

Great observation, seems I have relied too much on emodb when testing db.get() ;)

@hagenw hagenw added the bug Something isn't working label Sep 27, 2024
@hagenw
Copy link
Member

hagenw commented Sep 27, 2024

I had a look at the unittests, which test a lot of cases:

@pytest.mark.parametrize(
"db, scheme, additional_schemes, expected",
[
(
"mono_db",
"non-existing",
[],
pd.DataFrame(
{
"non-existing": [],
},
index=audformat.filewise_index(),
dtype="object",
),
),
(
"mono_db",
"weather",
[],
pd.DataFrame(
{
"weather": [],
},
index=audformat.filewise_index(),
dtype="object",
),
),
(
"mono_db",
"gender",
[],
pd.DataFrame(
{
"gender": ["female", "", "male"],
},
index=audformat.filewise_index(["f1.wav", "f2.wav", "f3.wav"]),
dtype="string",
),
),
(
"mono_db",
"sex",
[],
pd.DataFrame(
{
"sex": ["female", "male"],
},
index=audformat.filewise_index(["f1.wav", "f3.wav"]),
dtype="object",
),
),
(
"mono_db",
"sex",
["gender"],
pd.concat(
[
pd.Series(
["female", "male"],
index=audformat.filewise_index(["f1.wav", "f3.wav"]),
dtype="object",
name="sex",
),
pd.Series(
["female", "male"],
index=audformat.filewise_index(["f1.wav", "f3.wav"]),
dtype="string",
name="gender",
),
],
axis=1,
),
),
(
"mono_db",
"gender",
["sex", "non-existing"],
pd.concat(
[
pd.Series(
["female", "", "male"],
index=audformat.filewise_index(["f1.wav", "f2.wav", "f3.wav"]),
dtype="string",
name="gender",
),
pd.Series(
["female", np.nan, "male"],
index=audformat.filewise_index(["f1.wav", "f2.wav", "f3.wav"]),
dtype="object",
name="sex",
),
pd.Series(
[],
index=audformat.filewise_index(),
dtype="object",
name="non-existing",
),
],
axis=1,
),
),
# Ensure that requesting a non-existing scheme
# before an existing scheme
# does return values for existing schemes.
# https://github.com/audeering/audformat/issues/426
(
"mono_db",
"gender",
["non-existing", "sex"],
pd.concat(
[
pd.Series(
["female", "", "male"],
index=audformat.filewise_index(["f1.wav", "f2.wav", "f3.wav"]),
dtype="string",
name="gender",
),
pd.Series(
[],
index=audformat.filewise_index(),
dtype="object",
name="non-existing",
),
pd.Series(
["female", np.nan, "male"],
index=audformat.filewise_index(["f1.wav", "f2.wav", "f3.wav"]),
dtype="object",
name="sex",
),
],
axis=1,
),
),
# Ensure that requesting a non-existing scheme
# before an existing scheme
# does return values for existing schemes.
# https://github.com/audeering/audformat/issues/426
(
"mono_db",
"gender",
["numbers", "non-existing", "sex"],
pd.concat(
[
pd.Series(
["female", "", "male"],
index=audformat.filewise_index(["f1.wav", "f2.wav", "f3.wav"]),
dtype="string",
name="gender",
),
pd.Series(
[0, 1, 2],
index=audformat.filewise_index(["f1.wav", "f2.wav", "f3.wav"]),
dtype="Int64",
name="numbers",
),
pd.Series(
[],
index=audformat.filewise_index(),
dtype="object",
name="non-existing",
),
pd.Series(
["female", np.nan, "male"],
index=audformat.filewise_index(["f1.wav", "f2.wav", "f3.wav"]),
dtype="object",
name="sex",
),
],
axis=1,
),
),
(
"mono_db",
"winner",
[],
pd.DataFrame(
{
"winner": ["w1", "w1", "w2", "w1", "w1", "w1", "w1"],
},
index=audformat.utils.union(
[
audformat.filewise_index(["f1.wav", "f2.wav", "f3.wav"]),
audformat.segmented_index(
["f1.wav", "f1.wav", "f1.wav", "f2.wav"],
[0, 0.1, 0.3, 0],
[0.2, 0.2, 0.5, 0.7],
),
]
),
dtype=pd.CategoricalDtype(
["w1", "w2", "w3"],
ordered=False,
),
),
),
(
"mono_db",
"year",
[],
pd.DataFrame(
{
"year": [1995, 1995, 1996, 1995, 1995, 1995, 1995],
},
index=audformat.utils.union(
[
audformat.filewise_index(["f1.wav", "f2.wav", "f3.wav"]),
audformat.segmented_index(
["f1.wav", "f1.wav", "f1.wav", "f2.wav"],
[0, 0.1, 0.3, 0],
[0.2, 0.2, 0.5, 0.7],
),
]
),
dtype="Int64",
),
),
(
"mono_db",
"rating",
[],
pd.DataFrame(
{
"rating": [1, 0, 1, 1, 1, 2, 2],
},
index=audformat.utils.union(
[
audformat.filewise_index(["f3.wav", "f1.wav", "f2.wav"]),
audformat.segmented_index(
["f1.wav", "f1.wav", "f1.wav", "f2.wav"],
[0, 0.1, 0.3, 0],
[0.2, 0.2, 0.5, 0.7],
),
]
),
dtype=pd.CategoricalDtype(
[0, 1, 2],
ordered=False,
),
),
),
(
"mono_db",
"regression",
[],
pd.DataFrame(
{
"regression": [0.3, 0.2, 0.6, 0.4],
},
index=audformat.segmented_index(
["f1.wav", "f1.wav", "f1.wav", "f2.wav"],
[0, 0.1, 0.3, 0],
[0.2, 0.2, 0.5, 0.7],
),
dtype="float",
),
),
(
"mono_db",
"selection",
[],
pd.DataFrame(
{
"selection": [1, 1, 1],
},
index=audformat.filewise_index(["f1.wav", "f2.wav", "f3.wav"]),
dtype=pd.CategoricalDtype(
[1, 0],
ordered=False,
),
),
),
(
"mono_db",
"numbers",
[],
pd.DataFrame(
{
"numbers": [0, 1, 2],
},
index=audformat.filewise_index(["f1.wav", "f2.wav", "f3.wav"]),
dtype="Int64",
),
),
(
"overlapping_data_db",
"gender",
[],
pd.DataFrame(
{
"gender": ["female", np.nan],
},
index=audformat.filewise_index(["f1.wav", "f2.wav"]),
dtype=pd.CategoricalDtype(
["female", "male"],
ordered=False,
),
),
),
(
"scheme_not_assigned_db",
"gender",
[],
pd.DataFrame(
{
"gender": [],
},
index=audformat.filewise_index(),
dtype="object",
),
),
(
"scheme_not_assigned_db",
"rating",
[],
pd.DataFrame(
{
"rating": [],
},
index=audformat.filewise_index(),
dtype="object",
),
),
(
"scheme_not_assigned_db",
"rater1",
[],
pd.DataFrame(
{
"rater1": [1],
},
index=audformat.filewise_index(["f1.wav"]),
dtype="object",
),
),
(
"wrong_scheme_labels_db",
"gender",
[],
pd.DataFrame(
{
"gender": ["female", np.nan],
},
index=audformat.filewise_index(["f1.wav", "f2.wav"]),
dtype="string",
),
),
],
)
def test_database_get(request, db, scheme, additional_schemes, expected):
    """Compare the frame returned by ``get()`` with the expected frame.

    ``db`` is the name of a fixture; the fixture value is resolved
    through ``request`` before ``get()`` is called with ``scheme`` and
    ``additional_schemes``.
    """
    database = request.getfixturevalue(db)
    result = database.get(scheme, additional_schemes)
    pd.testing.assert_frame_equal(result, expected)

These do indeed miss your case, which might explain why we haven't found the bug before.

hagenw added a commit that referenced this issue Sep 27, 2024
@hagenw
Copy link
Member

hagenw commented Sep 27, 2024

I was able to add a test case covering the case you describe in #461. It does indeed fail, as it returns NaN instead of the desired labels:

[left]:  [NaN, NaN, NaN, NaN]
Categories (3, object): ['s1', 's2', 's3']
[right]: ['s1', 's1', 's1', NaN]
Categories (3, object): ['s1', 's2', 's3']

@hagenw
Copy link
Member

hagenw commented Sep 27, 2024

An example, to clarify the expected behavior.

import audformat


# Database with two schemes: "label1" (str) and "label2" (int).
db = audformat.Database("mydb")
db.schemes["label1"] = audformat.Scheme("str")
db.schemes["label2"] = audformat.Scheme("int")
# Filewise index over two files, and a segmented index with two segments per file.
files = audformat.filewise_index(["f1", "f2"])
segments = audformat.segmented_index(["f1", "f1", "f2", "f2"], [0, 1, 0, 1], [1, 2, 1, 2])
# Filewise table "files": one "label1" value per file.
db["files"] = audformat.Table(files)
db["files"]["label1"] = audformat.Column(scheme_id="label1")
db["files"]["label1"].set(["a", "b"])
# Segmented table "segments": one "label2" value per segment.
db["segments"] = audformat.Table(segments)
db["segments"]["label2"] = audformat.Column(scheme_id="label2")
db["segments"]["label2"].set([0, 1, 2, 3])

Which means we have:

>>> db["files"].df
     label1
file       
f1        a
f2        b

>>> db["segments"].df
                                      label2
file start           end                    
f1   0 days 00:00:00 0 days 00:00:01       0
     0 days 00:00:01 0 days 00:00:02       1
f2   0 days 00:00:00 0 days 00:00:01       2
     0 days 00:00:01 0 days 00:00:02       3

We can request the scheme stored in the filewise table as the main scheme, and the scheme stored in the segmented table as additional annotations. As we cannot always merge them easily, we return the filewise and segmented labels in different rows:

>>> db.get("label1", additional_schemes="label2")
                                     label1  label2
file start           end                           
f1   0 days 00:00:00 NaT                  a    <NA>
                     0 days 00:00:01   <NA>       0
     0 days 00:00:01 0 days 00:00:02   <NA>       1
f2   0 days 00:00:00 NaT                  b    <NA>
                     0 days 00:00:01   <NA>       2
     0 days 00:00:01 0 days 00:00:02   <NA>       3

When the scheme stored in the segmented table is requested as the main scheme, and a scheme stored in a filewise table is added as an additional scheme, I would expect the following result, since we can clearly map the filewise label to every segment (the topic of this issue):

>>> db.get("label2", additional_schemes="label1")
                                      label2 label1
file start           end                           
f1   0 days 00:00:00 0 days 00:00:01       0      a
     0 days 00:00:01 0 days 00:00:02       1      a
f2   0 days 00:00:00 0 days 00:00:01       2      b
     0 days 00:00:01 0 days 00:00:02       3      b

An alternative approach would be to be in line with the behavior of the first example, and not merge the rows together:

>>> db.get("label2", additional_schemes="label1")
                                     label2  label1
file start           end                           
f1   0 days 00:00:00 NaT               <NA>       a
                     0 days 00:00:01      0    <NA>
     0 days 00:00:01 0 days 00:00:02      1    <NA>
f2   0 days 00:00:00 NaT               <NA>       b
                     0 days 00:00:01      2    <NA>
     0 days 00:00:01 0 days 00:00:02      3    <NA>

As a user, I would prefer the first solution, which merges the entries together.
@audeerington do you agree, or what would you expect to happen?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working
Projects
None yet
Development

Successfully merging a pull request may close this issue.

2 participants