diff --git a/tests/unittests/text/test_bertscore.py b/tests/unittests/text/test_bertscore.py
index 43bd0a5c327..3740576b36b 100644
--- a/tests/unittests/text/test_bertscore.py
+++ b/tests/unittests/text/test_bertscore.py
@@ -192,153 +192,152 @@ def test_bertscore_sorting(idf: bool):
     # First index should be the self-comparison - sorting by length should not shuffle this
 
+
 @skip_on_connection_issues()
 @pytest.mark.skipif(not _TRANSFORMERS_GREATER_EQUAL_4_4, reason="test requires transformers>4.4")
 @pytest.mark.parametrize(
     ["idf", "batch_size"],
-    [(False, 1),
-     (False, 9),
-     (True, 1),
-     (True, 9)],
+    [(False, 1), (False, 9), (True, 1), (True, 9)],
 )
 def test_bertscore_most_similar(idf: bool, batch_size: int):
     """Tests that BERTScore actually gives the highest score to self-similarity."""
     short = "hello there"
     long = "master kenobi"
     longer = "general kenobi"
-
+
     sentences = [short, long, longer]
-    preds, targets = list(zip(*list(product(sentences,
-                                            sentences))))
-    score = bert_score(preds, targets, idf=idf, lang="en",
-                       rescale_with_baseline=False, batch_size=batch_size)
+    preds, targets = list(zip(*list(product(sentences, sentences))))
+    score = bert_score(preds, targets, idf=idf, lang="en", rescale_with_baseline=False, batch_size=batch_size)
 
     for i in range(len(preds)):
-        max_pred = i%(len(sentences))*(1 + len(sentences))
-        max_target = int(i/(len(sentences)))*(1 + len(sentences))
-        assert score["f1"][i] <= score["f1"][max_pred], \
-            f"pair: {preds[i], targets[i]} does not have a lower score than {preds[max_pred], targets[max_pred]}\n{i=}{max_pred=}"
-        assert score["f1"][i] <= score["f1"][max_target], \
-            f"pair: {preds[i], targets[i]} does not have a lower score than {preds[max_target], targets[max_target]}\n{i=}{max_target=}"
+        max_pred = i % (len(sentences)) * (1 + len(sentences))
+        max_target = int(i / (len(sentences))) * (1 + len(sentences))
+        assert (
+            score["f1"][i] <= score["f1"][max_pred]
+        ), f"pair: {preds[i], targets[i]} does not have a lower score than {preds[max_pred], targets[max_pred]}\n{i=}{max_pred=}"
+        assert (
+            score["f1"][i] <= score["f1"][max_target]
+        ), f"pair: {preds[i], targets[i]} does not have a lower score than {preds[max_target], targets[max_target]}\n{i=}{max_target=}"
+
 
 @skip_on_connection_issues()
 @pytest.mark.skipif(not _TRANSFORMERS_GREATER_EQUAL_4_4, reason="test requires transformers>4.4")
 @pytest.mark.parametrize(
-    ["idf"],
-    [(False,),
-     (True,)],
+    "idf",
+    [(False,), (True,)],
 )
 def test_bertscore_most_similar_separate_calls(idf: bool):
     """Tests that BERTScore actually gives the highest score to self-similarity."""
     short = "hello there"
     long = "master kenobi"
     longer = "general kenobi"
-
+
     sentences = [short, long, longer]
-    pairs_to_compare = product(sentences,
-                               sentences)
-    preds, targets = list(zip(*list(product(sentences,
-                                            sentences))))
-    score = {"f1": [bert_score([pred],[target], idf=idf, lang="en",
-                               rescale_with_baseline=False)["f1"].item()
-                    for pred, target in pairs_to_compare]}
+    pairs_to_compare = product(sentences, sentences)
+    preds, targets = list(zip(*list(product(sentences, sentences))))
+    score = {
+        "f1": [
+            bert_score([pred], [target], idf=idf, lang="en", rescale_with_baseline=False)["f1"].item()
+            for pred, target in pairs_to_compare
+        ]
+    }
 
     for i in range(len(preds)):
-        max_pred = i%(len(sentences))*(1 + len(sentences))
-        max_target = int(i/(len(sentences)))*(1 + len(sentences))
-        assert score["f1"][i] <= score["f1"][max_pred], \
-            f"pair: {preds[i], targets[i]} does not have a lower score than {preds[max_pred], targets[max_pred]}\n{i=}{max_pred=}"
-        assert score["f1"][i] <= score["f1"][max_target], \
-            f"pair: {preds[i], targets[i]} does not have a lower score than {preds[max_target], targets[max_target]}\n{i=}{max_target=}"
-
-
+        max_pred = i % (len(sentences)) * (1 + len(sentences))
+        max_target = int(i / (len(sentences))) * (1 + len(sentences))
+        assert (
+            score["f1"][i] <= score["f1"][max_pred]
+        ), f"pair: {preds[i], targets[i]} does not have a lower score than {preds[max_pred], targets[max_pred]}\n{i=}{max_pred=}"
+        assert (
+            score["f1"][i] <= score["f1"][max_target]
+        ), f"pair: {preds[i], targets[i]} does not have a lower score than {preds[max_target], targets[max_target]}\n{i=}{max_target=}"
+
+
 @skip_on_connection_issues()
 @pytest.mark.skipif(not _TRANSFORMERS_GREATER_EQUAL_4_4, reason="test requires transformers>4.4")
 @pytest.mark.parametrize(
     ["idf", "batch_size"],
-    [(False, 1),
-     (False, 9),
-     (True, 1),
-     (True, 9)],
+    [(False, 1), (False, 9), (True, 1), (True, 9)],
 )
 def test_bertscore_symmetry(idf: bool, batch_size: int):
     """Tests that BERTscore F1 score is symmetric between reference and prediction.
-    As F1 is symmetric, it should also be symmetric."""
+    As F1 is symmetric, it should also be symmetric.
+
+    """
     short = "hello there"
     long = "master kenobi"
     longer = "general kenobi"
 
     sentences = [short, long, longer]
-    preds, targets = list(zip(*list(product(sentences,
-                                            sentences))))
-    score = bert_score(preds, targets, idf=idf, lang="en",
-                       rescale_with_baseline=False, batch_size=batch_size)
+    preds, targets = list(zip(*list(product(sentences, sentences))))
+    score = bert_score(preds, targets, idf=idf, lang="en", rescale_with_baseline=False, batch_size=batch_size)
 
     for i in range(len(preds)):
         for j in range(len(targets)):
             if preds[i] == targets[j] and preds[j] == targets[i]:
-                assert score['f1'][i] == pytest.approx(score['f1'][j]), \
-                    f"f1 score for {(preds[i], targets[i])} is not the same as {(preds[j], targets[j])}."
-                pass
+                assert score["f1"][i] == pytest.approx(
+                    score["f1"][j]
+                ), f"f1 score for {(preds[i], targets[i])} is not the same as {(preds[j], targets[j])}."
+
 
 @skip_on_connection_issues()
 @pytest.mark.skipif(not _TRANSFORMERS_GREATER_EQUAL_4_4, reason="test requires transformers>4.4")
 @pytest.mark.parametrize(
-    ["idf"],
-    [(False,),
-     (True,)],
+    "idf",
+    [(False,), (True,)],
 )
 def test_bertscore_symmetry_separate_calls(idf: bool):
     """Tests that BERTscore F1 score is symmetric between reference and prediction.
-    As F1 is symmetric, it should also be symmetric."""
+
+    As F1 is symmetric, it should also be symmetric.
+
+    """
     short = "hello there"
     long = "master kenobi"
     longer = "general kenobi"
-
+
     sentences = [short, long, longer]
-    pairs_to_compare = product(sentences,
-                               sentences)
-    preds, targets = list(zip(*list(product(sentences,
-                                            sentences))))
-    score = {"f1": [bert_score([pred],[target], idf=idf, lang="en",
-                               rescale_with_baseline=False)["f1"].item()
-                    for pred, target in pairs_to_compare]}
+    pairs_to_compare = product(sentences, sentences)
+    preds, targets = list(zip(*list(product(sentences, sentences))))
+    score = {
+        "f1": [
+            bert_score([pred], [target], idf=idf, lang="en", rescale_with_baseline=False)["f1"].item()
+            for pred, target in pairs_to_compare
+        ]
+    }
 
     for i in range(len(preds)):
         for j in range(len(targets)):
             if preds[i] == targets[j] and preds[j] == targets[i]:
-                assert score['f1'][i] == pytest.approx(score['f1'][j]), \
-                    f"f1 score for {(preds[i], targets[i])} is not the same as {(preds[j], targets[j])}."
-                pass
+                assert score["f1"][i] == pytest.approx(
+                    score["f1"][j]
+                ), f"f1 score for {(preds[i], targets[i])} is not the same as {(preds[j], targets[j])}."
+
 
 @skip_on_connection_issues()
 @pytest.mark.skipif(not _TRANSFORMERS_GREATER_EQUAL_4_4, reason="test requires transformers>4.4")
-@pytest.mark.parametrize(
-    ["idf", "batch_size"],
-    [(False, 1),
-     (False, 3)]
-)
+@pytest.mark.parametrize(["idf", "batch_size"], [(False, 1), (False, 3)])
 def test_bertscore_additional_sentence(idf: bool, batch_size: int):
-    """Tests that BERTscore keeps the same scores for previous inputs
-    by adding additional elements to the input lists. This should be the case for idf=False."""
+    """Tests that BERTscore keeps the same scores for previous inputs by adding additional elements to the input lists.
+
+    This should be the case for idf=False.
+    """
     short = "hello there"
     long = "master kenobi"
     longer = "general kenobi"
 
-    preds = [long,long]
-    targets = [long,short]
+    preds = [long, long]
+    targets = [long, short]
 
-    score = bert_score(preds, targets, idf=idf, lang="en",
-                       rescale_with_baseline=False, batch_size=batch_size)
+    score = bert_score(preds, targets, idf=idf, lang="en", rescale_with_baseline=False, batch_size=batch_size)
     longlong = score["f1"][0]
     longshort = score["f1"][1]
 
     # First index should be the self-comparison - sorting by length should not shuffle this
     assert longlong > longshort
-
+
     preds = preds + [short, longer]
     targets = targets + [longer, long]
 
-    score = bert_score(preds, targets, idf=idf, lang="en",
-                       rescale_with_baseline=False, batch_size=batch_size)
+    score = bert_score(preds, targets, idf=idf, lang="en", rescale_with_baseline=False, batch_size=batch_size)
 
     # First two indices should be exactly as in the previous call to metric
     assert score["f1"][0] == pytest.approx(longlong)
@@ -346,4 +345,3 @@ def test_bertscore_additional_sentence(idf: bool, batch_size: int):
     # Indices 1 and 2 should also be smaller than self-comparison.
     assert score["f1"][0] > score["f1"][1]
     assert score["f1"][0] > score["f1"][2]
-
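
For reference, here is a minimal sketch (not part of the patch) of the index arithmetic the most-similar tests rely on. With three sentences, product() yields nine (pred, target) pairs, and the expressions i % n * (1 + n) and i // n * (1 + n) point at the self-comparison pairs that share the i-th pair's target and prediction, respectively; the sentence values are taken from the tests above, everything else is illustrative.

# Illustration only, not part of the test suite.
from itertools import product

sentences = ["hello there", "master kenobi", "general kenobi"]
preds, targets = zip(*product(sentences, sentences))
n = len(sentences)

for i in range(len(preds)):
    max_pred = i % n * (1 + n)      # index of the (targets[i], targets[i]) pair
    max_target = i // n * (1 + n)   # index of the (preds[i], preds[i]) pair
    assert preds[max_pred] == targets[max_pred] == targets[i]
    assert preds[max_target] == targets[max_target] == preds[i]

The tests then assert that the BERTScore F1 of any mixed pair never exceeds the F1 of these two self-comparison pairs.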