fix dist_n and diversity
Alexey Gorbatovski authored and committed on Oct 17, 2024
1 parent 8c5a0a8 commit 3ff9145
Showing 3 changed files with 42 additions and 22 deletions.
turbo_alignment/metrics/distinctness.py (46 changes: 30 additions & 16 deletions)

```diff
@@ -2,20 +2,23 @@
 
 from turbo_alignment.metrics.metric import Metric
 from turbo_alignment.settings.metric import ElementWiseScores, MetricResults, MetricType
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
 
 @Metric.register(MetricType.DIST_N)
 class DistinctnessMetric(Metric):
     def compute(self, **kwargs) -> list[MetricResults]:
         predictions: list[list[str]] = kwargs.get('predictions', None)
         dataset_name: str = kwargs.get('dataset_name', '')
+        tokenizer: PreTrainedTokenizerBase = kwargs.get("tokenizer", None)
+        vocab_size: int = tokenizer.vocab_size
 
         if predictions is None:
             raise ValueError('predictions should not be None')
 
         dist_n = defaultdict(list)
         for prompt_answers in predictions:
-            ans_dist_n = self.distinctness(prompt_answers)
+            ans_dist_n = self.distinctness(prompt_answers, vocab_size)
             for label, value in ans_dist_n.items():
                 dist_n[label].append(value)
 
@@ -31,21 +34,32 @@ def compute(self, **kwargs) -> list[MetricResults]:
         ]
 
     @staticmethod
-    def distinctness(answers: list[str]) -> dict[str, float]:
-        unigrams, bigrams, trigrams = set(), set(), set()
-        total_words = 0
+    def distinctness(answers: list[str], vocab_size: int) -> dict[str, float]:
+        ngram_sets = [set() for _ in range(5)]
+        total_ngrams = [0] * 5
 
         for answer in answers:
             words = answer.split(' ')
-            total_words += len(words)
-            unigrams.update(words)
-            for i in range(len(words) - 1):
-                bigrams.add(words[i] + '_' + words[i + 1])
-            for i in range(len(words) - 2):
-                trigrams.add(words[i] + '_' + words[i + 1] + '_' + words[i + 2])
-
-        return {
-            'dist_1': len(unigrams) / total_words,
-            'dist_2': len(bigrams) / total_words,
-            'dist_3': len(trigrams) / total_words,
-        }
+            ngram_sets[0].update(words)
+            total_ngrams[0] += len(words)
+
+            for n in range(1, 5):
+                ngrams = ['_'.join(words[i : i + n + 1]) for i in range(len(words) - n)]
+                ngram_sets[n].update(ngrams)
+                total_ngrams[n] += len(ngrams)
+
+        result = dict()
+        for n in range(5):
+            result[f'dist_{n+1}'] = len(ngram_sets[n]) / total_ngrams[n] if total_ngrams[n] > 0 else 0
+            try:
+                result[f'ead_dist_{n+1}'] = (
+                    len(ngram_sets[n]) / (vocab_size * (1 - ((vocab_size - 1) / vocab_size) ** total_ngrams[n]))
+                    if total_ngrams[n] > 0
+                    else 0
+                )
+            except ZeroDivisionError:
+                result[f'ead_dist_{n+1}'] = 0
+
+        result['dist_mean'] = sum(result[f'dist_{n+1}'] for n in range(5)) / 5
+        result['ead_dist_mean'] = sum(result[f'ead_dist_{n+1}'] for n in range(5)) / 5
+        return result
```
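The rewritten `distinctness` generalizes the old unigram/bigram/trigram logic to n-grams of order 1 through 5 and adds an expectation-adjusted variant, `ead_dist_n`. Its denominator, `vocab_size * (1 - ((vocab_size - 1) / vocab_size) ** total_ngrams)`, is the expected number of distinct n-grams after `total_ngrams` uniform draws from a vocabulary of size `vocab_size`, matching the Expectation-Adjusted Distinct (EAD) correction of Liu et al. (2022) for the length bias of plain dist-n. Note that `compute` now dereferences `tokenizer.vocab_size` before any None check, so the metric requires a tokenizer in its kwargs. Below is a minimal standalone sketch of the computation; the toy answers and the GPT-2-sized `vocab_size` are illustrative assumptions, not repository fixtures.

```python
# Toy inputs (assumptions for illustration only).
answers = ['the cat sat on the mat', 'the cat sat on the rug']
vocab_size = 50_257

ngram_sets: list[set[str]] = [set() for _ in range(5)]
total_ngrams = [0] * 5
for answer in answers:
    words = answer.split(' ')
    for n in range(5):  # index n holds n-grams of order n + 1
        grams = ['_'.join(words[i : i + n + 1]) for i in range(len(words) - n)]
        ngram_sets[n].update(grams)
        total_ngrams[n] += len(grams)

for n in range(5):
    distinct, total = len(ngram_sets[n]), total_ngrams[n]
    dist = distinct / total if total else 0
    # Expected number of distinct n-grams after `total` uniform draws
    # from a vocabulary of `vocab_size` items.
    expected = vocab_size * (1 - ((vocab_size - 1) / vocab_size) ** total)
    ead = distinct / expected if total else 0
    print(f'dist_{n + 1} = {dist:.3f}, ead_dist_{n + 1} = {ead:.3f}')
```

For short texts the expected count is close to `total` itself, so `ead_dist_n` roughly equals `dist_n`; the correction matters for long generations, where plain dist-n shrinks mechanically as length grows.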
turbo_alignment/metrics/diversity.py (16 changes: 11 additions & 5 deletions)

```diff
@@ -15,6 +15,7 @@ def compute(self, **kwargs) -> list[MetricResults]:
         tokenizer: PreTrainedTokenizerBase = kwargs.get('tokenizer', None)
         predictions: list[list[str]] = kwargs.get('predictions', None)
         dataset_name: str = kwargs.get('dataset_name', '')
+        top_k: int = kwargs.get('top_k', None)
 
         if predictions is None:
             raise ValueError('predictions should not be None')
@@ -25,7 +26,7 @@ def compute(self, **kwargs) -> list[MetricResults]:
         element_wise_diversity_scores = [
             ElementWiseScores(
                 label=dataset_name + '@@' + 'diversity',
-                values=[self.average_token_entropy(answer_group, tokenizer) for answer_group in predictions],
+                values=[self.average_token_entropy(answer_group, tokenizer, top_k) for answer_group in predictions],
             )
         ]
 
@@ -34,15 +35,15 @@ def compute(self, **kwargs) -> list[MetricResults]:
             for need_average in self._settings.need_average
         ]
 
-    def average_token_entropy(self, answer_group: list[str], tokenizer: PreTrainedTokenizerBase) -> float:
-        entropies = [self.token_entropy(answer, tokenizer) for answer in answer_group]
+    def average_token_entropy(self, answer_group: list[str], tokenizer: PreTrainedTokenizerBase, top_k: int) -> float:
+        entropies = [self.token_entropy(answer, tokenizer, top_k) for answer in answer_group]
         if entropies:
             return sum(entropies) / len(entropies)
 
         return np.nan
 
     @staticmethod
-    def token_entropy(sample: str, tokenizer: PreTrainedTokenizerBase) -> float:
+    def token_entropy(sample: str, tokenizer: PreTrainedTokenizerBase, top_k: int = None) -> float:
         stats: dict[int, Any] = defaultdict(int)
         num_tokens = 0
         tokens = tokenizer.encode(sample)
@@ -54,4 +55,9 @@ def token_entropy(sample: str, tokenizer: PreTrainedTokenizerBase) -> float:
         for k in stats.keys():
             stats[k] /= num_tokens
 
-        return entropy(list(stats.values()))
+        if top_k is None:
+            top_k_stats = list(stats.values())
+        else:
+            top_k_stats = sorted(stats.values(), reverse=True)[:top_k]
+
+        return entropy(top_k_stats)
```
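The diversity metric now takes an optional `top_k`: each answer's empirical token distribution is computed as before, but only the `top_k` largest probabilities are kept before the entropy call when the setting is given. Assuming `entropy` here is `scipy.stats.entropy` (its import sits outside these hunks), the truncated vector is renormalized to sum to one before the Shannon entropy is taken. A self-contained sketch of the same idea, with a plain whitespace split standing in for the Hugging Face tokenizer (an assumption for illustration):

```python
from collections import defaultdict

from scipy.stats import entropy  # renormalizes its input before computing H


def token_entropy(tokens: list[str], top_k: int | None = None) -> float:
    # Empirical token distribution of a single answer.
    stats: dict[str, int] = defaultdict(int)
    for token in tokens:
        stats[token] += 1
    probs = [count / len(tokens) for count in stats.values()]
    if top_k is not None:
        # Keep only the head of the distribution, so rare tokens stop
        # inflating the entropy of long, noisy answers.
        probs = sorted(probs, reverse=True)[:top_k]
    return float(entropy(probs))


sample = 'the cat sat on the mat and the dog sat too'.split(' ')
print(token_entropy(sample))           # entropy over all token types
print(token_entropy(sample, top_k=3))  # entropy over the 3 most frequent
```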
turbo_alignment/metrics/registry.py (2 changes: 1 addition & 1 deletion)

```diff
@@ -31,7 +31,7 @@ class DistinctnessSettings(MetricSettings):
 
 @MetricSettingsRegistry.register(MetricType.DIVERSITY)
 class DiversitySettings(MetricSettings):
-    ...
+    top_k: int | None = None
 
 
 @MetricSettingsRegistry.register(MetricType.LENGTH)
```
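The registry change makes `top_k` a declared, optional field on `DiversitySettings`, defaulting to `None` (use the full distribution, i.e. the previous behavior). A hedged usage sketch; `need_average` is inherited from `MetricSettings`, and the value shown for it is an illustrative assumption:

```python
from turbo_alignment.metrics.registry import DiversitySettings

# top_k defaults to None, which preserves the old full-distribution entropy.
settings = DiversitySettings(need_average=[True], top_k=50)
assert settings.top_k == 50
```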
