diff --git a/README.md b/README.md
index 8dd26446..2ae3e5ed 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@
+
@@ -118,6 +119,7 @@ These commands are grouped, so each group has a prefix but you can easily tab co
 - The bot needs Administrative permissions for this, and you need to set `MODERATIONS_ALERT_CHANNEL` to the channel ID of a desired channel in your .env file if you want to receive alerts about moderated messages.
 - This uses the OpenAI Moderations endpoint to check for messages, requests are only sent to the moderations endpoint at a MINIMUM request gap of 0.5 seconds, to ensure you don't get blocked and to ensure reliability.
 - The bot uses numerical thresholds to determine whether a message is toxic or not, and I have manually tested and fine tuned these thresholds to a point that I think is good, please open an issue if you have any suggestions for the thresholds!
+- The bot uses two sets of thresholds: a stricter set above which it will outright delete a message, and a more sensitive set above which it will send a message to the alert channel, notifying admins and giving them quick options to delete the message and time out the user (check out the screenshots at the beginning of the README to see this).
 
 # Permanent Memory
 Permanent memory has now been implemented into the bot, using the OpenAI Ada embeddings endpoint, and Pinecone DB.
diff --git a/gpt3discord.py b/gpt3discord.py
index 6b357e61..bbd26a95 100644
--- a/gpt3discord.py
+++ b/gpt3discord.py
@@ -24,7 +24,7 @@
 from models.usage_service_model import UsageService
 from models.env_service_model import EnvService
 
-__version__ = "4.1"
+__version__ = "4.2"
 
 """
 The pinecone service is used to store and retrieve conversation embeddings.
diff --git a/models/moderations_service_model.py b/models/moderations_service_model.py
index dd5029f4..7875e72d 100644
--- a/models/moderations_service_model.py
+++ b/models/moderations_service_model.py
@@ -1,7 +1,7 @@
 import asyncio
 import os
 import traceback
-from datetime import datetime
+from datetime import datetime, timedelta
 from pathlib import Path
 
 import discord
@@ -12,6 +12,42 @@
 usage_service = UsageService(Path(os.environ.get("DATA_DIR", os.getcwd())))
 model = Model(usage_service)
 
+
+class ModerationResult:
+    WARN = "warn"
+    DELETE = "delete"
+    NONE = "none"
+
+
+class ThresholdSet:
+    # Threshold arguments, in order: hate, hate/threatening, self-harm,
+    # sexual, sexual/minors, violence, violence/graphic
+    def __init__(self, h_t, hv_t, sh_t, s_t, sm_t, v_t, vg_t):
+        self.keys = [
+            "hate",
+            "hate/threatening",
+            "self-harm",
+            "sexual",
+            "sexual/minors",
+            "violence",
+            "violence/graphic",
+        ]
+        self.thresholds = [
+            h_t,
+            hv_t,
+            sh_t,
+            s_t,
+            sm_t,
+            v_t,
+            vg_t,
+        ]
+
+    # Compare a moderations response against this set; returns (exceeded, flagged)
+    def moderate(self, text, response_message):
+        category_scores = response_message["results"][0]["category_scores"]
+        flagged = response_message["results"][0]["flagged"]
+
+        for category, threshold in zip(self.keys, self.thresholds):
+            if category_scores[category] > threshold:
+                return (True, flagged)
+        return (False, flagged)
+
+
 class Moderation:
     def __init__(self, message, timestamp):
@@ -33,6 +69,24 @@ def build_moderation_embed():
         )
         return embed
 
+    @staticmethod
+    def build_admin_warning_message(moderated_message, deleted_message=None, timed_out=None):
+        embed = discord.Embed(
+            title="Potentially unwanted message in the "
+            + moderated_message.guild.name
+            + " server",
+            description=f"**Message from {moderated_message.author.mention}:** {moderated_message.content}",
+            colour=discord.Colour.yellow(),
+        )
+        link = f"https://discord.com/channels/{moderated_message.guild.id}/{moderated_message.channel.id}/{moderated_message.id}"
+        embed.add_field(name="Message link", value=link, inline=False)
+        if deleted_message:
+            embed.add_field(name="Message deleted by", value=deleted_message, inline=False)
+        if timed_out:
+            embed.add_field(name="User timed out by", value=timed_out, inline=False)
+        return embed
+
     @staticmethod
     def build_admin_moderated_message(moderated_message, response_message):
@@ -52,57 +106,20 @@ def build_admin_moderated_message(moderated_message, response_message):
 
     @staticmethod
     def determine_moderation_result(text, response):
-        HATE_THRESHOLD = 0.005
-        HATE_VIOLENCE_THRESHOLD = 0.05
-        SELF_HARM_THRESHOLD = 0.05
-        SEXUAL_THRESHOLD = 0.91
-        SEXUAL_MINORS_THRESHOLD = 0.1
-        VIOLENCE_THRESHOLD = 0.08
-        VIOLENCE_GRAPHIC_THRESHOLD = 0.1
-
-        extreme_hatred_qualifiers = [
-            "i fucking hate",
-            "fucking hate",
-            "i fucking despise",
-        ]
+        warn_set = ThresholdSet(0.005, 0.05, 0.05, 0.91, 0.1, 0.08, 0.1)  # more sensitive
+        delete_set = ThresholdSet(0.1, 0.1, 0.1, 0.95, 0.2, 0.6, 0.4)  # stricter
 
-        thresholds = [
-            HATE_THRESHOLD,
-            HATE_VIOLENCE_THRESHOLD,
-            SELF_HARM_THRESHOLD,
-            SEXUAL_THRESHOLD,
-            SEXUAL_MINORS_THRESHOLD,
-            VIOLENCE_THRESHOLD,
-            VIOLENCE_GRAPHIC_THRESHOLD,
-        ]
-        threshold_iterator = [
-            "hate",
-            "hate/threatening",
-            "self-harm",
-            "sexual",
-            "sexual/minors",
-            "violence",
-            "violence/graphic",
-        ]
-
-        category_scores = response["results"][0]["category_scores"]
-
-        flagged = response["results"][0]["flagged"]
+        warn_result, flagged_warn = warn_set.moderate(text, response)
+        delete_result, flagged_delete = delete_set.moderate(text, response)
 
-        # Iterate the category scores using the threshold_iterator and compare the values to thresholds
-        for category, threshold in zip(threshold_iterator, thresholds):
-            if category == "hate":
-                if (
-                    "hate" in text.lower()
-                ):  # The word "hate" makes the model oversensitive. This is a (bad) workaround.
-                    threshold = 0.1
-                if any(word in text.lower() for word in extreme_hatred_qualifiers):
-                    threshold = 0.6
+        if delete_result:
+            return ModerationResult.DELETE
+        elif warn_result:
+            return ModerationResult.WARN
+        else:
+            return ModerationResult.NONE
 
-            if category_scores[category] > threshold:
-                return True
-        return False
 
     # This function will be called by the bot to process the message queue
     @staticmethod
     async def process_moderation_queue(
@@ -128,7 +145,7 @@
                     to_moderate.message.content, response
                 )
 
-                if moderation_result:
+                if moderation_result == ModerationResult.DELETE:
                     # Take care of the flagged message
                     response_message = await to_moderate.message.reply(
                         embed=Moderation.build_moderation_embed()
@@ -143,6 +160,11 @@
                             to_moderate, response_message
                         )
                     )
+                elif moderation_result == ModerationResult.WARN:
+                    response_message = await moderations_alert_channel.send(
+                        embed=Moderation.build_admin_warning_message(to_moderate.message),
+                    )
+                    await response_message.edit(view=ModerationAdminView(to_moderate.message, response_message))
                 else:
                     await moderation_queue.put(to_moderate)
@@ -152,3 +174,63 @@
         except:
             traceback.print_exc()
             pass
+
+
+class ModerationAdminView(discord.ui.View):
+    def __init__(self, message, moderation_message, nodelete=False):
+        super().__init__(timeout=None)  # Persistent view; the buttons never expire
+        self.message = message
+        self.moderation_message = moderation_message
+        if not nodelete:
+            self.add_item(DeleteMessageButton(self.message, self.moderation_message))
+        self.add_item(TimeoutUserButton(self.message, self.moderation_message, 1, nodelete))
+        self.add_item(TimeoutUserButton(self.message, self.moderation_message, 6, nodelete))
+        self.add_item(TimeoutUserButton(self.message, self.moderation_message, 12, nodelete))
+        self.add_item(TimeoutUserButton(self.message, self.moderation_message, 24, nodelete))
+
+
+class DeleteMessageButton(discord.ui.Button["ModerationAdminView"]):
+    def __init__(self, message, moderation_message):
+        super().__init__(style=discord.ButtonStyle.danger, label="Delete Message")
+        self.message = message
+        self.moderation_message = moderation_message
+
+    async def callback(self, interaction: discord.Interaction):
+        # Delete the flagged message and confirm to the admin who clicked
+        await self.message.delete()
+        await interaction.response.send_message(
+            "This message was deleted", ephemeral=True, delete_after=10
+        )
+        # Rebuild the alert embed without the delete button, crediting the admin
+        await self.moderation_message.edit(
+            embed=Moderation.build_admin_warning_message(
+                self.message, deleted_message=interaction.user.mention
+            ),
+            view=ModerationAdminView(self.message, self.moderation_message, nodelete=True),
+        )
+
+
+class TimeoutUserButton(discord.ui.Button["ModerationAdminView"]):
+    def __init__(self, message, moderation_message, hours, nodelete):
+        super().__init__(style=discord.ButtonStyle.danger, label=f"Timeout {hours}h")
+        self.message = message
+        self.moderation_message = moderation_message
+        self.hours = hours
+        self.nodelete = nodelete
+
+    async def callback(self, interaction: discord.Interaction):
+        # Delete the flagged message if it still exists
+        try:
+            await self.message.delete()
+        except Exception:
+            pass
+
+        # Time the author out for the selected number of hours
+        try:
+            await self.message.author.timeout(
+                until=discord.utils.utcnow() + timedelta(hours=self.hours),
+                reason="Breaking the server chat rules",
+            )
+        except Exception:
+            traceback.print_exc()
+
+        await interaction.response.send_message(
+            f"This user was timed out for {self.hours} hour(s)", ephemeral=True, delete_after=10
+        )
+        # Rebuild the alert embed without the delete button, crediting the admin
+        await self.moderation_message.edit(
+            embed=Moderation.build_admin_warning_message(
+                self.message,
+                deleted_message=interaction.user.mention,
+                timed_out=interaction.user.mention,
+            ),
+            view=ModerationAdminView(self.message, self.moderation_message, nodelete=True),
+        )
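
As a quick sanity check of the new two-tier logic, here is a minimal sketch that drives the same `ThresholdSet` pair used in `determine_moderation_result` with a hand-made moderations response. The category scores are invented for illustration, and the import assumes `models/moderations_service_model.py` from this diff is on the path; nothing here talks to Discord or OpenAI.

```python
# Sketch only: invented scores, shaped like an OpenAI Moderations response.
from models.moderations_service_model import ModerationResult, ThresholdSet

# Same sets as Moderation.determine_moderation_result
warn_set = ThresholdSet(0.005, 0.05, 0.05, 0.91, 0.1, 0.08, 0.1)
delete_set = ThresholdSet(0.1, 0.1, 0.1, 0.95, 0.2, 0.6, 0.4)

fake_response = {
    "results": [
        {
            "flagged": False,
            "category_scores": {
                "hate": 0.07,  # above the warn threshold (0.005), below delete (0.1)
                "hate/threatening": 0.0,
                "self-harm": 0.0,
                "sexual": 0.0,
                "sexual/minors": 0.0,
                "violence": 0.01,
                "violence/graphic": 0.0,
            },
        }
    ]
}

delete_hit, _ = delete_set.moderate("example text", fake_response)
warn_hit, _ = warn_set.moderate("example text", fake_response)

if delete_hit:
    result = ModerationResult.DELETE
elif warn_hit:
    result = ModerationResult.WARN
else:
    result = ModerationResult.NONE

print(result)  # "warn": alert-channel embed with admin buttons, no deletion
```

Because the delete set is checked first, a message that clears both bars is deleted outright, while one that only clears the more sensitive warn bar produces the admin alert with the Delete Message and Timeout buttons.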