Fix competency extraction feature (#145)

ls1intum · Aug 27, 2024 · 5765c9c · 5765c9c
1 parent 2e4f640
commit 5765c9c
Show file tree

Hide file tree

Showing 14 changed files with 292 additions and 37 deletions.
diff --git a/app/domain/__init__.py b/app/domain/__init__.py
@@ -3,6 +3,9 @@
 from .pipeline_execution_settings_dto import PipelineExecutionSettingsDTO
 from .chat.chat_pipeline_execution_dto import ChatPipelineExecutionDTO
 from .chat.chat_pipeline_execution_base_data_dto import ChatPipelineExecutionBaseDataDTO
+from .competency_extraction_pipeline_execution_dto import (
+    CompetencyExtractionPipelineExecutionDTO,
+)
 from app.domain.chat.exercise_chat.exercise_chat_pipeline_execution_dto import (
     ExerciseChatPipelineExecutionDTO,
 )

diff --git a/app/domain/chat/chat_pipeline_execution_dto.py b/app/domain/chat/chat_pipeline_execution_dto.py
@@ -2,16 +2,11 @@
 
 from pydantic import Field
 
-from app.domain import PipelineExecutionDTO, PipelineExecutionSettingsDTO
+from app.domain import PipelineExecutionDTO
 from app.domain.pyris_message import PyrisMessage
 from app.domain.data.user_dto import UserDTO
-from app.domain.status.stage_dto import StageDTO
 
 
 class ChatPipelineExecutionDTO(PipelineExecutionDTO):
     chat_history: List[PyrisMessage] = Field(alias="chatHistory", default=[])
     user: Optional[UserDTO]
-    settings: Optional[PipelineExecutionSettingsDTO]
-    initial_stages: Optional[List[StageDTO]] = Field(
-        default=None, alias="initialStages"
-    )
diff --git a/app/domain/competency_extraction_pipeline_execution_dto.py b/app/domain/competency_extraction_pipeline_execution_dto.py
@@ -0,0 +1,22 @@
+from typing import List
+
+from pydantic import Field, BaseModel
+
+from . import PipelineExecutionDTO
+from .data.competency_dto import CompetencyTaxonomy, Competency
+
+
+class CompetencyExtractionPipelineExecutionDTO(BaseModel):
+    """Request payload for the competency extraction pipeline.
+
+    Every field except ``execution`` declares a camelCase alias
+    (``courseDescription``, ``currentCompetencies``, ``taxonomyOptions``,
+    ``maxN``).
+    """
+
+    # Shared execution data (settings and initial stages).
+    execution: PipelineExecutionDTO
+    # No default is declared for the course description.
+    course_description: str = Field(alias="courseDescription")
+    # Defaults to an empty list.
+    current_competencies: list[Competency] = Field(
+        alias="currentCompetencies", default=[]
+    )
+    # Defaults to an empty list.
+    taxonomy_options: List[CompetencyTaxonomy] = Field(
+        alias="taxonomyOptions", default=[]
+    )
+    max_n: int = Field(
+        alias="maxN",
+        description="Maximum number of competencies to extract from the course description",
+        default=10,
+    )
diff --git a/app/domain/data/competency_dto.py b/app/domain/data/competency_dto.py
@@ -3,6 +3,7 @@
 from typing import Optional
 
 from pydantic import BaseModel, Field
+from pydantic.v1 import validator
 
 
 class CompetencyTaxonomy(str, Enum):
@@ -21,3 +22,29 @@ class CompetencyDTO(BaseModel):
     taxonomy: Optional[CompetencyTaxonomy] = None
     soft_due_date: Optional[datetime] = Field(default=None, alias="softDueDate")
     optional: Optional[bool] = None
+
+
+class Competency(BaseModel):
+    """A competency consisting of a title, a description and a taxonomy."""
+
+    title: str = Field(
+        description="Title of the competency that contains no more than 4 words",
+    )
+    description: str = Field(
+        description="Description of the competency as plain string. DO NOT RETURN A LIST OF STRINGS."
+    )
+    taxonomy: CompetencyTaxonomy = Field(
+        description="Selected taxonomy based on bloom's taxonomy"
+    )
+
+    # NOTE(review): ``validator`` is imported from ``pydantic.v1`` while this
+    # class derives from ``pydantic.BaseModel``; confirm that the two
+    # validators below are actually registered and executed with the
+    # installed pydantic version.
+    @validator("title")
+    def validate_title(cls, field):
+        """Reject titles with more than 4 whitespace-separated words.
+
+        Returns the title unchanged; raises ``ValueError`` otherwise.
+        """
+        if len(field.split()) > 4:
+            raise ValueError("Title must contain no more than 4 words")
+        return field
+
+    @validator("taxonomy")
+    def validate_selected_taxonomy(cls, field):
+        """Reject values that are not found in ``CompetencyTaxonomy.__members__``.
+
+        Returns the value unchanged; raises ``ValueError`` otherwise.
+        """
+        # ``__members__`` is keyed by member name, so this is a lookup by name.
+        if field not in CompetencyTaxonomy.__members__:
+            raise ValueError(f"Invalid taxonomy: {field}")
+        return field
diff --git a/app/domain/ingestion/ingestion_pipeline_execution_dto.py b/app/domain/ingestion/ingestion_pipeline_execution_dto.py
@@ -1,17 +1,12 @@
-from typing import List, Optional
+from typing import List
 
 from pydantic import Field
 
-from app.domain import PipelineExecutionDTO, PipelineExecutionSettingsDTO
+from app.domain import PipelineExecutionDTO
 from app.domain.data.lecture_unit_dto import LectureUnitDTO
-from app.domain.status.stage_dto import StageDTO
 
 
 class IngestionPipelineExecutionDto(PipelineExecutionDTO):
     lecture_units: List[LectureUnitDTO] = Field(
         ..., alias="pyrisLectureUnitWebhookDTOS"
     )
-    settings: Optional[PipelineExecutionSettingsDTO]
-    initial_stages: Optional[List[StageDTO]] = Field(
-        default=None, alias="initialStages"
-    )
diff --git a/app/domain/pipeline_execution_dto.py b/app/domain/pipeline_execution_dto.py
@@ -1,8 +1,16 @@
-from pydantic import BaseModel
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+from app.domain.pipeline_execution_settings_dto import PipelineExecutionSettingsDTO
+from app.domain.status.stage_dto import StageDTO
 
 
 class PipelineExecutionDTO(BaseModel):
-    pass
+    settings: Optional[PipelineExecutionSettingsDTO]
+    initial_stages: Optional[list[StageDTO]] = Field(
+        default=None, alias="initialStages"
+    )
 
     class Config:
         populate_by_name = True
diff --git a/app/domain/status/competency_extraction_status_update_dto.py b/app/domain/status/competency_extraction_status_update_dto.py
@@ -0,0 +1,6 @@
+from app.domain.data.competency_dto import Competency
+from app.domain.status.status_update_dto import StatusUpdateDTO
+
+
+class CompetencyExtractionStatusUpdateDTO(StatusUpdateDTO):
+    """Status update that additionally carries a list of competencies."""
+
+    # Defaults to an empty list.
+    result: list[Competency] = []
diff --git a/app/llm/external/openai_chat.py b/app/llm/external/openai_chat.py
@@ -7,7 +7,6 @@
 from openai import OpenAI
 from openai.lib.azure import AzureOpenAI
 from openai.types.chat import ChatCompletionMessage, ChatCompletionMessageParam
-from openai.types.chat.completion_create_params import ResponseFormat
 from openai.types.shared_params import ResponseFormatJSONObject
 
 from ...common.message_converters import map_str_to_role, map_role_to_str

diff --git a/app/pipeline/__init__.py b/app/pipeline/__init__.py
@@ -1 +1 @@
-from ..pipeline.pipeline import Pipeline
+from app.pipeline.pipeline import Pipeline
diff --git a/app/pipeline/chat/course_chat_pipeline.py b/app/pipeline/chat/course_chat_pipeline.py
@@ -266,7 +266,8 @@ def get_competency_list() -> list:
         def lecture_content_retrieval() -> str:
             """
             Retrieve content from indexed lecture slides.
-            This will run a RAG retrieval based on the chat history on the indexed lecture slides and return the most relevant paragraphs.
+            This will run a RAG retrieval based on the chat history on the indexed lecture slides and return the most
+            relevant paragraphs.
             Use this if you think it can be useful to answer the student's question, or if the student explicitly asks
             a question about the lecture content or slides.
             Only use this once.

diff --git a/app/pipeline/competency_extraction_pipeline.py b/app/pipeline/competency_extraction_pipeline.py
@@ -0,0 +1,96 @@
+import logging
+from typing import Optional
+
+from langchain.output_parsers import PydanticOutputParser
+from langchain_core.prompts import (
+    ChatPromptTemplate,
+)
+
+from app.domain import (
+    CompetencyExtractionPipelineExecutionDTO,
+    PyrisMessage,
+    IrisMessageRole,
+)
+from app.domain.data.text_message_content_dto import TextMessageContentDTO
+from app.domain.data.competency_dto import Competency
+from app.llm import CapabilityRequestHandler, RequirementList, CompletionArguments
+from app.pipeline import Pipeline
+from app.web.status.status_update import CompetencyExtractionCallback
+from app.pipeline.prompts.competency_extraction import system_prompt
+
+logger = logging.getLogger(__name__)
+
+
+class CompetencyExtractionPipeline(Pipeline):
+    """Extract competencies from a course description with one LLM request.
+
+    The pipeline fills ``system_prompt`` with the request data, sends it as a
+    single system message and parses the JSON objects found in the reply into
+    ``Competency`` instances.
+    """
+
+    callback: Optional[CompetencyExtractionCallback]
+    request_handler: CapabilityRequestHandler
+    output_parser: PydanticOutputParser
+
+    def __init__(self, callback: Optional[CompetencyExtractionCallback] = None):
+        """Create the pipeline.
+
+        ``callback`` receives the final result via ``done``. It may be
+        ``None``, in which case the result is not reported anywhere.
+        """
+        super().__init__(
+            implementation_id="competency_extraction_pipeline_reference_impl"
+        )
+        self.callback = callback
+        self.request_handler = CapabilityRequestHandler(requirements=RequirementList())
+        self.output_parser = PydanticOutputParser(pydantic_object=Competency)
+
+    def __call__(
+        self,
+        dto: CompetencyExtractionPipelineExecutionDTO,
+        prompt: Optional[ChatPromptTemplate] = None,
+        **kwargs,
+    ):
+        """Run the extraction and hand the result to the callback.
+
+        ``prompt`` and ``kwargs`` are accepted for interface compatibility but
+        are not used; the prompt is always built from ``system_prompt``.
+
+        Raises ``ValueError`` if the course description or the taxonomy
+        options are empty, or if ``max_n`` is not positive.
+        """
+        if not dto.course_description:
+            raise ValueError("Course description is required")
+        if not dto.taxonomy_options:
+            raise ValueError("Taxonomy options are required")
+        # A negative max_n used to pass the truthiness check and was then
+        # applied as a negative slice bound; reject it explicitly.
+        if dto.max_n <= 0:
+            raise ValueError("Positive max_n is required")
+
+        taxonomy_options = ", ".join(dto.taxonomy_options)
+        current_competencies = "\n\n".join(
+            [c.model_dump_json(indent=4) for c in dto.current_competencies]
+        )
+        if current_competencies:
+            current_competencies = (
+                f"\nHere are the current competencies in the course:\n{current_competencies}\n"
+                f"Do not repeat these competencies.\n"
+            )
+
+        prompt_text = system_prompt.format(
+            taxonomy_list=taxonomy_options,
+            course_description=dto.course_description,
+            max_n=dto.max_n,
+            current_competencies=current_competencies,
+        )
+        system_message = PyrisMessage(
+            sender=IrisMessageRole.SYSTEM,
+            contents=[TextMessageContentDTO(text_content=prompt_text)],
+        )
+
+        response = self.request_handler.chat(
+            [system_message], CompletionArguments(temperature=0.4)
+        )
+        response_text = response.contents[0].text_content
+
+        generated_competencies: list[Competency] = []
+
+        # The model is asked to separate competencies by two newlines. The
+        # max_n limit is applied to successfully parsed competencies, so
+        # chunks without JSON (e.g. an introductory sentence) or with invalid
+        # JSON do not use up result slots.
+        for i, chunk in enumerate(response_text.split("\n\n")):
+            if len(generated_competencies) >= dto.max_n:
+                break
+            logger.debug("Processing competency %d: %s", i + 1, chunk)
+            # Cut out the first "{" up to the next "}" after it. This assumes
+            # a flat JSON object without "}" inside its string values.
+            start = chunk.find("{")
+            end = chunk.find("}", start + 1) if start != -1 else -1
+            if end == -1:
+                logger.debug("Skipping competency without JSON")
+                continue
+            try:
+                competency = self.output_parser.parse(chunk[start : end + 1])
+            except Exception as e:
+                # Best effort: an unparsable or invalid competency is dropped
+                # instead of failing the whole extraction.
+                logger.debug("Error parsing competency: %s", e)
+                continue
+            logger.debug("Generated competency: %s", competency)
+            generated_competencies.append(competency)
+
+        if self.callback is None:
+            # The constructor allows omitting the callback; avoid failing with
+            # an AttributeError after the LLM request has already been made.
+            logger.warning(
+                "No callback configured; extracted competencies are not reported"
+            )
+            return
+        self.callback.done(final_result=generated_competencies)
diff --git a/app/pipeline/prompts/competency_extraction.py b/app/pipeline/prompts/competency_extraction.py
@@ -0,0 +1,44 @@
+# System prompt of the competency extraction pipeline.
+# Placeholders filled via ``str.format``: taxonomy_list, course_description,
+# current_competencies and max_n. The braces of the JSON template are doubled
+# so that ``format`` emits them literally, and the line breaks inside its
+# "description" value are written as an escaped backslash-n so that the
+# rendered example is valid JSON.
+system_prompt = """
+You are an expert in all topics of computer science and its practical applications.
+Your task consists of two parts:
+1. Read the provided curriculum description of a university course.
+2. Extract all learning goals ("competencies") from the course description.
+
+Each competency must contain the following fields:
+
+- title:
+The title of the competency, which is a specific topic or skill. This should be a short phrase of at most 4 words.
+
+- description:
+A detailed description of the competency in 2 to 5 bullet points.
+Each bullet point illustrates a specific skill or concept of the competency.
+Each bullet point is a complete sentence containing at most 15 words.
+Each bullet point is on a new line and starts with "- ".
+
+- taxonomy:
+The classification of the competency within Bloom's taxonomy.
+You must choose from these options in Bloom's taxonomy: {taxonomy_list}
+
+All competencies must meet the following requirements:
+
+- is mentioned in the course description.
+- corresponds to exactly one subject or skill covered in the course description.
+- is assigned to exactly one level of Bloom's taxonomy.
+- is small and fine-grained. Large topics should be broken down into smaller competencies.
+- does not overlap with other competencies: each competency is unique. Expanding on a previous competency is allowed.
+
+Here is the provided course description: {course_description}
+
+Here is a template competency in JSON format:
+
+{{
+    "title": "Competency Title",
+    "description": "- You understand this.\\n- You are proficient in doing that.\\n- You know how to do this.",
+    "taxonomy": "ANALYZE"
+}}
+
+{current_competencies}
+
+Respond with 0 to {max_n} competencies extracted from the course description,
+each in JSON format, split by two newlines.
+"""
diff --git a/app/web/routers/pipelines.py b/app/web/routers/pipelines.py
@@ -9,14 +9,17 @@
 from app.domain import (
     ExerciseChatPipelineExecutionDTO,
     CourseChatPipelineExecutionDTO,
+    CompetencyExtractionPipelineExecutionDTO,
 )
 from app.web.status.status_update import (
     ExerciseChatStatusCallback,
     CourseChatStatusCallback,
+    CompetencyExtractionCallback,
 )
 from app.pipeline.chat.course_chat_pipeline import CourseChatPipeline
 from app.pipeline.chat.exercise_chat_pipeline import ExerciseChatPipeline
 from app.dependencies import TokenValidator
+from app.pipeline.competency_extraction_pipeline import CompetencyExtractionPipeline
 
 router = APIRouter(prefix="/api/v1/pipelines", tags=["pipelines"])
 logger = logging.getLogger(__name__)
@@ -86,6 +89,44 @@ def run_course_chat_pipeline(variant: str, dto: CourseChatPipelineExecutionDTO):
     thread.start()
 
 
+def run_competency_extraction_pipeline_worker(
+    dto: CompetencyExtractionPipelineExecutionDTO, _variant: str
+):
+    """Build the callback and the pipeline, then run the extraction for ``dto``.
+
+    A failure while preparing is logged and passed to ``capture_exception``;
+    a failure while running is logged and reported through ``callback.error``.
+    """
+    try:
+        execution = dto.execution
+        settings = execution.settings
+        callback = CompetencyExtractionCallback(
+            run_id=settings.authentication_token,
+            base_url=settings.artemis_base_url,
+            initial_stages=execution.initial_stages,
+        )
+        pipeline = CompetencyExtractionPipeline(callback=callback)
+    except Exception as setup_error:
+        # Without a callback there is nothing to report the failure to.
+        logger.error(
+            f"Error preparing competency extraction pipeline: {setup_error}"
+        )
+        logger.error(traceback.format_exc())
+        capture_exception(setup_error)
+        return
+
+    try:
+        pipeline(dto=dto)
+    except Exception as run_error:
+        logger.error(f"Error running competency extraction pipeline: {run_error}")
+        logger.error(traceback.format_exc())
+        callback.error("Fatal error.", exception=run_error)
+
+
+@router.post(
+    "/competency-extraction/{variant}/run",
+    status_code=status.HTTP_202_ACCEPTED,
+    dependencies=[Depends(TokenValidator())],
+)
+def run_competency_extraction_pipeline(
+    variant: str, dto: CompetencyExtractionPipelineExecutionDTO
+):
+    """Start the competency extraction worker in a new thread without waiting for it."""
+    Thread(
+        target=run_competency_extraction_pipeline_worker,
+        args=(dto, variant),
+    ).start()
+
+
 @router.get("/{feature}")
 def get_pipeline(feature: str):
     """