[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
pre-commit-ci[bot] committed Sep 13, 2024
1 parent 9c1f295 commit c14cde5
Showing 1 changed file with 74 additions and 76 deletions.
150 changes: 74 additions & 76 deletions tests/unittests/text/test_bertscore.py
The hook's changes here are stylistic reformatting: multi-line parametrize argument lists are collapsed onto single lines (with single-name argument lists given as plain strings), backslash-continued assert messages are rewritten with parenthesized expressions, and wrapped calls and docstrings are reflowed. The resulting version of the hunk is shown below.

@@ -192,158 +192,156 @@ def test_bertscore_sorting(idf: bool):

    # First index should be the self-comparison - sorting by length should not shuffle this


@skip_on_connection_issues()
@pytest.mark.skipif(not _TRANSFORMERS_GREATER_EQUAL_4_4, reason="test requires transformers>4.4")
@pytest.mark.parametrize(
    ["idf", "batch_size"],
    [(False, 1), (False, 9), (True, 1), (True, 9)],
)
def test_bertscore_most_similar(idf: bool, batch_size: int):
    """Tests that BERTScore actually gives the highest score to self-similarity."""
    short = "hello there"
    long = "master kenobi"
    longer = "general kenobi"

    sentences = [short, long, longer]
    preds, targets = list(zip(*list(product(sentences, sentences))))
    score = bert_score(preds, targets, idf=idf, lang="en", rescale_with_baseline=False, batch_size=batch_size)
    for i in range(len(preds)):
        max_pred = i % (len(sentences)) * (1 + len(sentences))
        max_target = int(i / (len(sentences))) * (1 + len(sentences))
        assert (
            score["f1"][i] <= score["f1"][max_pred]
        ), f"pair: {preds[i], targets[i]} does not have a lower score than {preds[max_pred], targets[max_pred]}\n{i=}{max_pred=}"
        assert (
            score["f1"][i] <= score["f1"][max_target]
        ), f"pair: {preds[i], targets[i]} does not have a lower score than {preds[max_target], targets[max_target]}\n{i=}{max_target=}"


@skip_on_connection_issues()
@pytest.mark.skipif(not _TRANSFORMERS_GREATER_EQUAL_4_4, reason="test requires transformers>4.4")
@pytest.mark.parametrize(
    "idf",
    [(False,), (True,)],
)
def test_bertscore_most_similar_separate_calls(idf: bool):
    """Tests that BERTScore actually gives the highest score to self-similarity."""
    short = "hello there"
    long = "master kenobi"
    longer = "general kenobi"

    sentences = [short, long, longer]
    pairs_to_compare = product(sentences, sentences)
    preds, targets = list(zip(*list(product(sentences, sentences))))
    score = {
        "f1": [
            bert_score([pred], [target], idf=idf, lang="en", rescale_with_baseline=False)["f1"].item()
            for pred, target in pairs_to_compare
        ]
    }
    for i in range(len(preds)):
        max_pred = i % (len(sentences)) * (1 + len(sentences))
        max_target = int(i / (len(sentences))) * (1 + len(sentences))
        assert (
            score["f1"][i] <= score["f1"][max_pred]
        ), f"pair: {preds[i], targets[i]} does not have a lower score than {preds[max_pred], targets[max_pred]}\n{i=}{max_pred=}"
        assert (
            score["f1"][i] <= score["f1"][max_target]
        ), f"pair: {preds[i], targets[i]} does not have a lower score than {preds[max_target], targets[max_target]}\n{i=}{max_target=}"


@skip_on_connection_issues()
@pytest.mark.skipif(not _TRANSFORMERS_GREATER_EQUAL_4_4, reason="test requires transformers>4.4")
@pytest.mark.parametrize(
    ["idf", "batch_size"],
    [(False, 1), (False, 9), (True, 1), (True, 9)],
)
def test_bertscore_symmetry(idf: bool, batch_size: int):
    """Tests that the BERTScore F1 score is symmetric between reference and prediction.

    Since the underlying F1 computation is symmetric, swapping preds and targets should give the same score.
    """
    short = "hello there"
    long = "master kenobi"
    longer = "general kenobi"

    sentences = [short, long, longer]
    preds, targets = list(zip(*list(product(sentences, sentences))))
    score = bert_score(preds, targets, idf=idf, lang="en", rescale_with_baseline=False, batch_size=batch_size)
    for i in range(len(preds)):
        for j in range(len(targets)):
            if preds[i] == targets[j] and preds[j] == targets[i]:
                assert score["f1"][i] == pytest.approx(
                    score["f1"][j]
                ), f"f1 score for {(preds[i], targets[i])} is not the same as {(preds[j], targets[j])}."

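A note on the symmetry check above: the nested loops find the swapped counterpart of each pair by comparing values. With the row-major ordering produced by product, that counterpart also has a closed-form index, as this standalone sketch (not part of the changed file; names are illustrative) shows:

from itertools import product

sentences = ["hello there", "master kenobi", "general kenobi"]
n = len(sentences)
pairs = list(product(sentences, sentences))

for p, (pred, target) in enumerate(pairs):
    q = (p % n) * n + (p // n)  # index of the swapped pair (target, pred)
    assert pairs[q] == (target, pred)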

@skip_on_connection_issues()
@pytest.mark.skipif(not _TRANSFORMERS_GREATER_EQUAL_4_4, reason="test requires transformers>4.4")
@pytest.mark.parametrize(
    "idf",
    [(False,), (True,)],
)
def test_bertscore_symmetry_separate_calls(idf: bool):
    """Tests that the BERTScore F1 score is symmetric between reference and prediction.

    Since the underlying F1 computation is symmetric, swapping preds and targets should give the same score.
    """
    short = "hello there"
    long = "master kenobi"
    longer = "general kenobi"

    sentences = [short, long, longer]
    pairs_to_compare = product(sentences, sentences)
    preds, targets = list(zip(*list(product(sentences, sentences))))
    score = {
        "f1": [
            bert_score([pred], [target], idf=idf, lang="en", rescale_with_baseline=False)["f1"].item()
            for pred, target in pairs_to_compare
        ]
    }
    for i in range(len(preds)):
        for j in range(len(targets)):
            if preds[i] == targets[j] and preds[j] == targets[i]:
                assert score["f1"][i] == pytest.approx(
                    score["f1"][j]
                ), f"f1 score for {(preds[i], targets[i])} is not the same as {(preds[j], targets[j])}."


@skip_on_connection_issues()
@pytest.mark.skipif(not _TRANSFORMERS_GREATER_EQUAL_4_4, reason="test requires transformers>4.4")
@pytest.mark.parametrize(["idf", "batch_size"], [(False, 1), (False, 3)])
def test_bertscore_additional_sentence(idf: bool, batch_size: int):
    """Tests that BERTScore keeps the same scores for the original inputs when additional pairs are appended.

    This should be the case for idf=False, since without idf weighting each pair is scored independently of the
    rest of the batch.
    """
    short = "hello there"
    long = "master kenobi"
    longer = "general kenobi"

    preds = [long, long]
    targets = [long, short]

    score = bert_score(preds, targets, idf=idf, lang="en", rescale_with_baseline=False, batch_size=batch_size)

    longlong = score["f1"][0]
    longshort = score["f1"][1]
    # First index should be the self-comparison - sorting by length should not shuffle this
    assert longlong > longshort

    preds = preds + [short, longer]
    targets = targets + [longer, long]

    score = bert_score(preds, targets, idf=idf, lang="en", rescale_with_baseline=False, batch_size=batch_size)

    # First two indices should be exactly as in the previous call to the metric
    assert score["f1"][0] == pytest.approx(longlong)
    assert score["f1"][1] == pytest.approx(longshort)
    # Indices 1 and 2 should also score below the self-comparison at index 0
    assert score["f1"][0] > score["f1"][1]
    assert score["f1"][0] > score["f1"][2]
