Update server to match sdk interface for modular checks

protectai · Jan 21, 2024 · 6f8a21e · 6f8a21e
1 parent 3c12155
commit 6f8a21e
Show file tree

Hide file tree

Showing 9 changed files with 340 additions and 232 deletions.
diff --git a/javascript-sdk/src/api.ts b/javascript-sdk/src/api.ts
@@ -44,25 +44,15 @@ export default class RebuffApi implements Rebuff {
 
   async detectInjection({
     userInput = "",
-    maxHeuristicScore = 0.75,
-    maxVectorScore = 0.9,
-    maxModelScore = 0.9,
-    runHeuristicCheck = true,
-    runVectorCheck = true,
-    runLanguageModelCheck = true,
+    tacticOverrides = [],
   }: DetectRequest): Promise<DetectResponse> {
     if (userInput === null) {
       throw new RebuffError("userInput is required");
     }
     const requestData: DetectRequest = {
       userInput: "",
       userInputBase64: encodeString(userInput),
-      runHeuristicCheck: runHeuristicCheck,
-      runVectorCheck: runVectorCheck,
-      runLanguageModelCheck: runLanguageModelCheck,
-      maxVectorScore,
-      maxModelScore,
-      maxHeuristicScore,
+      tacticOverrides,
     };
 
     const response = await fetch(`${this.apiUrl}/api/detect`, {
@@ -76,10 +66,6 @@ export default class RebuffApi implements Rebuff {
     if (!response.ok) {
       throw new RebuffError((responseData as any)?.message);
     }
-    responseData.injectionDetected =
-      responseData.heuristicScore > maxHeuristicScore ||
-      responseData.modelScore > maxModelScore ||
-      responseData.vectorScore.topScore > maxVectorScore;
     return responseData;
   }
 

diff --git a/python-sdk/rebuff/__init__.py b/python-sdk/rebuff/__init__.py
@@ -1,10 +1,11 @@
 from rebuff._version import __version__
 
 from .rebuff import (
-    ApiFailureResponse,
-    DetectApiRequest,
-    DetectApiSuccessResponse,
+    DetectResponse,
     Rebuff,
+    TacticName,
+    TacticOverride,
+    TacticResult,
 )
 
 from .sdk import RebuffSdk, RebuffDetectionResponse
diff --git a/python-sdk/rebuff/rebuff.py b/python-sdk/rebuff/rebuff.py
@@ -1,41 +1,134 @@
+from enum import Enum
 import secrets
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import List, Optional, Dict, Any, Union, Tuple
 
 import requests
 from pydantic import BaseModel
 
-
-class DetectApiRequest(BaseModel):
-    userInput: str
-    userInputBase64: Optional[str] = None
-    runHeuristicCheck: bool
-    runVectorCheck: bool
-    runLanguageModelCheck: bool
-    maxHeuristicScore: float
-    maxModelScore: float
-    maxVectorScore: float
-
-
-class DetectApiSuccessResponse(BaseModel):
-    heuristicScore: float
-    modelScore: float
-    vectorScore: Dict[str, float]
-    runHeuristicCheck: bool
-    runVectorCheck: bool
-    runLanguageModelCheck: bool
-    maxHeuristicScore: float
-    maxModelScore: float
-    maxVectorScore: float
-    injectionDetected: bool
-
-
-class ApiFailureResponse(BaseModel):
-    error: str
-    message: str
+def to_camel(string: str) -> str:
+    string_split = string.split("_")
+    return string_split[0] + "".join(word.capitalize() for word in string_split[1:])
+
+class RebuffBaseModel(BaseModel):
+  class Config:
+        alias_generator = to_camel
+        populate_by_name = True
+
+
+class TacticName(str, Enum):
+  HEURISTIC = "heuristic"
+  """
+  A series of heuristics are used to determine whether the input is prompt injection.
+  """
+
+  LANGUAGE_MODEL = "language_model"
+  """
+  A language model is asked if the input appears to be prompt injection.
+  """
+
+  VECTOR_DB = "vector_db"
+  """
+  A vector database of known prompt injection attacks is queried for similarity.
+  """
+
+class TacticOverride(RebuffBaseModel):
+  """
+  Override settings for a specific tactic.
+  """
+
+  name: TacticName
+  """
+  The name of the tactic to override.
+  """
+
+  threshold: Optional[float] = None
+  """
+  The threshold to use for this tactic. If the score is above this threshold, the tactic will be considered detected.
+  If not specified, the default threshold for the tactic will be used.
+  """
+
+  run: Optional[bool] = True
+  """
+  Whether to run this tactic. Defaults to true if not specified.
+  """
+
+class DetectRequest(RebuffBaseModel):
+  """
+  Request to detect prompt injection.
+  """
+
+  user_input: str
+  """
+  The user input to check for prompt injection.
+  """
+
+  user_input_base64: Optional[str] = None
+  """
+  The base64-encoded user input. If this is specified, the user input will be ignored.
+  """
+
+  tactic_overrides: Optional[List[TacticOverride]] = None
+  """
+  Any tactics to change behavior for. If any tactic is not specified, the default threshold for that tactic will be used.
+  """
+
+class TacticResult(RebuffBaseModel):
+  """
+  Result of a tactic execution.
+  """
+
+  name: str
+  """
+  The name of the tactic.
+  """
+
+  score: float
+  """
+  The score for the tactic. This is a number between 0 and 1. The closer to 1, the more likely that this is a prompt injection attempt.
+  """
+
+  detected: bool
+  """
+  Whether this tactic evaluated the input as a prompt injection attempt.
+  """
+
+  threshold: float
+  """
+  The threshold used for this tactic. If the score is above this threshold, the tactic will be considered detected.
+  """
+
+  additional_fields: Dict[str, Any]
+  """
+  Some tactics return additional fields:
+    * "vector_db":
+      - "countOverMaxVectorScore" (int): The number of different vectors whose similarity score is above the 
+          threshold.
+  """
+
+class DetectResponse(RebuffBaseModel):
+  """
+  Response from a prompt injection detection request.
+  """
+
+  injection_detected: bool
+  """
+  Whether prompt injection was detected.
+  """
+
+  tactic_results: List[TacticResult]
+  """
+  The result for each tactic that was executed.
+  """
+
+class ApiFailureResponse(Exception):
+   def __init__(self, error: str, message: str):
+       super().__init__(f"Error: {error}, Message: {message}")
+       self.error = error
+       self.message = message
 
 
 class Rebuff:
-    def __init__(self, api_token: str, api_url: str = "https://playground.rebuff.ai"):
+    def __init__(self, api_token: str, api_url: str = "https://www.rebuff.ai/playground"):
         self.api_token = api_token
         self.api_url = api_url
         self._headers = {
@@ -46,63 +139,47 @@ def __init__(self, api_token: str, api_url: str = "https://playground.rebuff.ai"
     def detect_injection(
         self,
         user_input: str,
-        max_heuristic_score: float = 0.75,
-        max_vector_score: float = 0.90,
-        max_model_score: float = 0.9,
-        check_heuristic: bool = True,
-        check_vector: bool = True,
-        check_llm: bool = True,
-    ) -> Union[DetectApiSuccessResponse, ApiFailureResponse]:
+        tactic_overrides: Optional[List[TacticOverride]] = None,
+    ) -> DetectResponse:
         """
         Detects if the given user input contains an injection attempt.
 
         Args:
             user_input (str): The user input to be checked for injection.
-            max_heuristic_score (float, optional): The maximum heuristic score allowed. Defaults to 0.75.
-            max_vector_score (float, optional): The maximum vector score allowed. Defaults to 0.90.
-            max_model_score (float, optional): The maximum model (LLM) score allowed. Defaults to 0.9.
-            check_heuristic (bool, optional): Whether to run the heuristic check. Defaults to True.
-            check_vector (bool, optional): Whether to run the vector check. Defaults to True.
-            check_llm (bool, optional): Whether to run the language model check. Defaults to True.
+            tactic_overrides (Optional[List[TacticOverride]], optional): A list of tactics to override.
+                If a tactic is not specified in this list, the default threshold for that tactic will be used.
 
         Returns:
-            Tuple[Union[DetectApiSuccessResponse, ApiFailureResponse], bool]: A tuple containing the detection
-                metrics and a boolean indicating if an injection was detected.
+            DetectResponse: An object containing the detection metrics and a boolean indicating if an injection was
+                detected.
+
+        Example:
+            >>> from rebuff import Rebuff, TacticOverride, TacticName
+            >>> rb = Rebuff(api_token='your_api_token')
+            >>> user_input = "Your user input here"
+            >>> tactic_overrides = [
+            ...    TacticOverride(name=TacticName.HEURISTIC, threshold=0.6),
+            ...    TacticOverride(name=TacticName.LANGUAGE_MODEL, run=False),
+            ... ]
+            >>> response = rb.detect_injection(user_input, tactic_overrides)
         """
-        request_data = DetectApiRequest(
-            userInput=user_input,
-            userInputBase64=encode_string(user_input),
-            runHeuristicCheck=check_heuristic,
-            runVectorCheck=check_vector,
-            runLanguageModelCheck=check_llm,
-            maxVectorScore=max_vector_score,
-            maxModelScore=max_model_score,
-            maxHeuristicScore=max_heuristic_score,
+        request_data = DetectRequest(
+            user_input=user_input,
+            user_input_base64=encode_string(user_input),
+            tactic_overrides=tactic_overrides,
         )
 
         response = requests.post(
             f"{self.api_url}/api/detect",
-            json=request_data.dict(),
+            json=request_data.model_dump(mode="json", by_alias=True, exclude_none=True),
             headers=self._headers,
         )
 
-        response.raise_for_status()
-
         response_json = response.json()
-        success_response = DetectApiSuccessResponse.parse_obj(response_json)
-
-        if (
-            success_response.heuristicScore > max_heuristic_score
-            or success_response.modelScore > max_model_score
-            or success_response.vectorScore["topScore"] > max_vector_score
-        ):
-            # Injection detected
-            success_response.injectionDetected = True
-            return success_response
-        else:
-            # No injection detected
-            success_response.injectionDetected = False
-            return success_response
+        if "error" in response_json:
+            raise ApiFailureResponse(response_json["error"], response_json.get("message", "No message provided"))
+        response.raise_for_status()
+        return DetectResponse.model_validate(response_json)
 
     @staticmethod
     def generate_canary_word(length: int = 8) -> str: