From 5850463e72732884da3ffc0f2acd8e51388f26bb Mon Sep 17 00:00:00 2001
From: Wendong <w3ndong.fan@gmail.com>
Date: Sun, 19 Jan 2025 05:46:19 +0800
Subject: [PATCH] feat: enhance source2synth

---
 camel/agents/multi_hop_generator_agent.py     |  38 ++-
 camel/agents/programmed_agent_instruction.py  |  91 +++++--
 camel/datagen/source2synth/__init__.py        |  31 +++
 .../source2synth/data_processor.py            | 223 +++++++++++++++---
 .../source2synth/models.py                    |  25 ++
 .../user_data_processor_config.py             |  17 +-
 .../self_instruct/data_output.json            |   0
 .../self_instruct/seed_tasks.jsonl            |   0
 .../self_instruct/self_instruct.py            |   0
 examples/{ => datagen}/source2synth.py        | 109 ++++++++-
 10 files changed, 473 insertions(+), 61 deletions(-)
 create mode 100644 camel/datagen/source2synth/__init__.py
 rename camel/{synthetic_datagen => datagen}/source2synth/data_processor.py (61%)
 rename camel/{synthetic_datagen => datagen}/source2synth/models.py (73%)
 rename camel/{synthetic_datagen => datagen}/source2synth/user_data_processor_config.py (85%)
 rename examples/{synthetic_datagen => datagen}/self_instruct/data_output.json (100%)
 rename examples/{synthetic_datagen => datagen}/self_instruct/seed_tasks.jsonl (100%)
 rename examples/{synthetic_datagen => datagen}/self_instruct/self_instruct.py (100%)
 rename examples/{ => datagen}/source2synth.py (59%)

diff --git a/camel/agents/multi_hop_generator_agent.py b/camel/agents/multi_hop_generator_agent.py
index a232fce846..988342b9af 100644
--- a/camel/agents/multi_hop_generator_agent.py
+++ b/camel/agents/multi_hop_generator_agent.py
@@ -22,17 +22,36 @@
     ProgrammedAgentInstructionResult,
     programmable_capability,
 )
-from camel.messages import BaseMessage
-from camel.synthetic_datagen.source2synth.models import (
+from camel.datagen.source2synth.models import (
     ContextPrompt,
     MultiHopQA,
 )
+from camel.messages import BaseMessage
 
 
 class MultiHopGeneratorAgent(ProgrammableChatAgent):
+    r"""An agent specialized in generating multi-hop question-answer pairs.
+
+    This agent is designed to create complex questions that require multiple
+    steps of reasoning to answer. It analyzes context to identify related
+    facts and generates questions that require connecting these facts
+    logically.
+
+    Attributes:
+        model_config (ConfigDict): Configuration for model behavior.
+        system_message (BaseMessage): System message defining agent's role and
+            instructions.
+    """
+
     model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    def __init__(self, **kwargs: Any):
+    def __init__(self, **kwargs: Any) -> None:
+        r"""Initialize the MultiHopGeneratorAgent.
+
+        Args:
+            **kwargs (Any): Additional keyword arguments to pass to the parent
+                class.
+        """
         super().__init__(**kwargs)
 
         system_text: str = textwrap.dedent(
@@ -64,6 +83,19 @@ def __init__(self, **kwargs: Any):
     def generate_multi_hop_qa(
         self, context: str
     ) -> ProgrammedAgentInstructionResult[MultiHopQA]:
+        r"""Generate a multi-hop question-answer pair from given context.
+
+        Args:
+            context (str): The input text context to generate QA from.
+
+        Returns:
+            ProgrammedAgentInstructionResult[MultiHopQA]: Result containing the
+                generated question, reasoning steps, answer, and supporting
+                facts.
+
+        Raises:
+            RuntimeError: If the agent fails to generate a response.
+        """
         context_prompt = ContextPrompt(
             main_context=context, related_contexts=None
         )
diff --git a/camel/agents/programmed_agent_instruction.py b/camel/agents/programmed_agent_instruction.py
index 708d5997bb..bf38d67107 100644
--- a/camel/agents/programmed_agent_instruction.py
+++ b/camel/agents/programmed_agent_instruction.py
@@ -26,6 +26,16 @@
 
 
 class ProgrammableAgentRequirement(Enum):
+    r"""Requirements for programmable agent state.
+
+    Defines the possible requirements that can be used to repair the state
+    of a programmable agent.
+
+    Attributes:
+        LAST_MESSAGE_NOT_USER (str): Requires that the last message in the
+            conversation was not from the user.
+    """
+
     LAST_MESSAGE_NOT_USER = "LAST_MESSAGE_NOT_USER"
 
 
@@ -34,6 +44,11 @@ class ProgrammedAgentInstructionResult(BaseModel, Generic[T]):
 
     Contains the messages exchanged during execution and the computed value.
     The value type is specified by the generic type parameter T.
+
+    Attributes:
+        user_message (BaseMessage): The message sent by the user.
+        agent_message (BaseMessage): The message sent by the agent.
+        value (T): The computed result value of type T.
     """
 
     user_message: BaseMessage
@@ -48,8 +63,7 @@ class AbstractProgrammableAgent(abc.ABC):
 
     A programmable agent is an agent that can be programmed to perform a
     specific function or task. This class defines the interface for a
-    programmable
-    agent.
+    programmable agent.
 
     These methods should be implemented in order to ensure the agent supports
     the necessary guarantees to enable a programming interface while
@@ -68,16 +82,15 @@ def run_atomic(
         An atomic operation is an operation that is guaranteed to
         be executed without interruption by any other operation.
 
-        If the operation fails or times out the agents state should be
-        unchanged.
+        Args:
+            callback (Callable[[], ProgrammedAgentInstructionResult[T]]): The
+                operation to execute atomically.
 
-        If an operation is already in progress, this method should throw an
-        exception. (It is up to the caller to do any queuing)
+        Returns:
+            ProgrammedAgentInstructionResult[T]: The result of the operation.
 
-        If the agent is in a state where it can perform the operation,
-        it must leave the agent in a state where it can perform the
-        operation again. Though if state changes in successful operation
-        improve its ability to perform the operation, it should keep them.
+        Raises:
+            RuntimeError: If an operation is already in progress.
         """
         raise NotImplementedError
 
@@ -86,10 +99,13 @@ def repair_state(self, requirement: ProgrammableAgentRequirement) -> None:
         r"""Repair the state of the agent.
 
         Agents may have other non-atomic interfaces, such as a user interface,
-        or chat between other agents.
+        or chat between other agents. This method should restore the agent to
+        a state where it can perform operations according to the specified
+        requirement.
 
-        This method should restore the agent to a state where it can perform
-        operations according to the specified requirement.
+        Args:
+            requirement (ProgrammableAgentRequirement): The requirement to
+                repair the state for.
         """
         raise NotImplementedError
 
@@ -99,10 +115,16 @@ def programmable_capability(
 ) -> Callable[..., ProgrammedAgentInstructionResult[T]]:
     r"""Decorator for programmable agent capabilities.
 
-    Wraps a method to ensure it is executed atomically via the agent's
-    run_atomic interface.
-    The decorated method must return a ProgrammedAgentInstructionResult with
-    appropriate type parameter.
+    Wraps a method so that it is executed atomically via the agent's
+    run_atomic interface, maintaining the agent's state guarantees.
+
+    Args:
+        func (Callable[..., ProgrammedAgentInstructionResult[T]]): The method
+            to decorate.
+
+    Returns:
+        Callable[..., ProgrammedAgentInstructionResult[T]]: The decorated
+            method that ensures atomic execution.
     """
 
     @wraps(func)
@@ -120,9 +142,20 @@ class ProgrammableChatAgent(ChatAgent, AbstractProgrammableAgent):
     Provides a default implementation of atomic execution using threading locks
     and basic state tracking for message roles. Implementing classes need to
     provide specific repair logic for their use cases.
+
+    Attributes:
+        _operation_lock (threading.Lock): Lock for ensuring atomic operations.
+        _last_message_role (Optional[str]): Role of the last message in the
+            conversation.
     """
 
-    def __init__(self, **kwargs: Any):
+    def __init__(self, **kwargs: Any) -> None:
+        r"""Initialize the ProgrammableChatAgent.
+
+        Args:
+            **kwargs (Any): Additional keyword arguments to pass to the parent
+                class.
+        """
         super().__init__(**kwargs)
         self._operation_lock = threading.Lock()
         self._last_message_role: Optional[str] = None
@@ -130,6 +163,20 @@ def __init__(self, **kwargs: Any):
     def run_atomic(
         self, callback: Callable[[], ProgrammedAgentInstructionResult[T]]
     ) -> ProgrammedAgentInstructionResult[T]:
+        r"""Run an atomic operation on the agent.
+
+        Ensures thread-safe execution of the callback function by using a lock.
+
+        Args:
+            callback (Callable[[], ProgrammedAgentInstructionResult[T]]): The
+                operation to execute atomically.
+
+        Returns:
+            ProgrammedAgentInstructionResult[T]: The result of the operation.
+
+        Raises:
+            RuntimeError: If an operation is already in progress.
+        """
         if not self._operation_lock.acquire(blocking=False):
             raise RuntimeError("Operation already in progress")
 
@@ -141,6 +188,14 @@ def run_atomic(
             self._operation_lock.release()
 
     def repair_state(self, requirement: ProgrammableAgentRequirement) -> None:
+        r"""Repair the state of the agent.
+
+        Implements basic state repair for message role requirements.
+
+        Args:
+            requirement (ProgrammableAgentRequirement): The requirement to
+                repair the state for.
+        """
         if requirement == ProgrammableAgentRequirement.LAST_MESSAGE_NOT_USER:
             if self._last_message_role == "user":
                 raise NotImplementedError(
diff --git a/camel/datagen/source2synth/__init__.py b/camel/datagen/source2synth/__init__.py
new file mode 100644
index 0000000000..e9ddca05fb
--- /dev/null
+++ b/camel/datagen/source2synth/__init__.py
@@ -0,0 +1,31 @@
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
+from .data_processor import (
+    DataCurator,
+    ExampleConstructor,
+    UserDataProcessor,
+)
+from .models import MultiHopQA, ReasoningStep
+from .user_data_processor_config import (
+    ProcessorConfig,
+)
+
+__all__ = [
+    "DataCurator",
+    "ExampleConstructor",
+    "ProcessorConfig",
+    "UserDataProcessor",
+    "ReasoningStep",
+    "MultiHopQA",
+]
diff --git a/camel/synthetic_datagen/source2synth/data_processor.py b/camel/datagen/source2synth/data_processor.py
similarity index 61%
rename from camel/synthetic_datagen/source2synth/data_processor.py
rename to camel/datagen/source2synth/data_processor.py
index 9780663c40..ec7d84ecc4 100644
--- a/camel/synthetic_datagen/source2synth/data_processor.py
+++ b/camel/datagen/source2synth/data_processor.py
@@ -15,33 +15,61 @@
 import random
 from typing import Any, Dict, List, Optional, Sequence
 
-import numpy as np
 from tqdm import tqdm
 
 from camel.agents.multi_hop_generator_agent import MultiHopGeneratorAgent
-from camel.logger import get_logger
-from camel.synthetic_datagen.source2synth.user_data_processor_config import (
+from camel.datagen.source2synth.user_data_processor_config import (
     ProcessorConfig,
 )
+from camel.logger import get_logger
 
 logger = get_logger(__name__)
 
 
 class UserDataProcessor:
-    r"""User Data Processor."""
+    r"""A processor for generating multi-hop question-answer pairs from user
+    data.
+
+    This class handles the processing of text data to generate multi-hop
+    question-answer pairs using either an AI model or rule-based approaches.
+    It manages the entire pipeline from text preprocessing to dataset curation.
+
+    Attributes:
+        config (ProcessorConfig): Configuration for data processing parameters.
+        rng (random.Random): Random number generator for reproducibility.
+        multi_hop_agent (Optional[MultiHopGeneratorAgent]): Agent for
+            generating QA pairs.
+    """
 
     def __init__(self, config: Optional[ProcessorConfig] = None):
+        r"""Initialize the UserDataProcessor.
+
+        Args:
+            config (Optional[ProcessorConfig], optional): Configuration for
+                data processing. (default: :obj:`None`)
+        """
         self.config = config or ProcessorConfig()
-        random.seed(self.config.seed)
-        np.random.seed(self.config.seed)
+        self.rng = random.Random(self.config.seed)
         self.multi_hop_agent = (
-            MultiHopGeneratorAgent() if self.config.use_ai_model else None
+            self.config.hop_generating_agent
+            if self.config.use_ai_model
+            else None
         )
 
     def process_text(
         self, text: str, source: str = "user_input"
     ) -> List[Dict[str, Any]]:
-        r"""Process a single text."""
+        r"""Process a single text to generate multi-hop QA pairs.
+
+        Args:
+            text (str): The input text to process.
+            source (str, optional): Source identifier for the text.
+                (default: :obj:`"user_input"`)
+
+        Returns:
+            List[Dict[str, Any]]: List of processed examples with QA pairs and
+                metadata.
+        """
         # Convert text to standard format
         raw_data = [
             {
@@ -55,7 +83,7 @@ def process_text(
         examples = constructor.construct_examples(raw_data)
 
         # Manage data
-        curator = DataCurator(self.config)
+        curator = DataCurator(self.config, self.rng)
         final_dataset = curator.curate_dataset(examples)
 
         return final_dataset
@@ -63,7 +91,20 @@ def process_text(
     def process_batch(
         self, texts: List[str], sources: Optional[List[str]] = None
     ) -> List[Dict[str, Any]]:
-        r"""Process multiple texts in batch."""
+        r"""Process multiple texts in batch to generate multi-hop QA pairs.
+
+        Args:
+            texts (List[str]): List of input texts to process.
+            sources (Optional[List[str]], optional): List of source
+                identifiers. (default: :obj:`None`)
+
+        Returns:
+            List[Dict[str, Any]]: List of processed examples with QA pairs and
+                metadata.
+
+        Raises:
+            ValueError: If length of sources doesn't match length of texts.
+        """
         if sources is None:
             sources = ["user_input"] * len(texts)
         elif len(sources) != len(texts):
@@ -82,27 +123,52 @@ def process_batch(
         examples = constructor.construct_examples(raw_data)
 
         # Manage data
-        curator = DataCurator(self.config)
+        curator = DataCurator(self.config, self.rng)
         final_dataset = curator.curate_dataset(examples)
 
         return final_dataset
 
 
 class ExampleConstructor:
-    r"""Example Constructor."""
+    r"""Constructs training examples from raw text data.
+
+    This class handles the construction of training examples by preprocessing
+    text, extracting information pairs, and generating question-answer pairs.
+
+    Attributes:
+        config (ProcessorConfig): Configuration for example construction.
+        multi_hop_agent (Optional[MultiHopGeneratorAgent]): Agent for QA
+            generation.
+    """
 
     def __init__(
         self,
         config: ProcessorConfig,
         multi_hop_agent: Optional[MultiHopGeneratorAgent] = None,
     ):
+        r"""Initialize the ExampleConstructor.
+
+        Args:
+            config (ProcessorConfig): Configuration for example construction.
+            multi_hop_agent (Optional[MultiHopGeneratorAgent], optional):
+                Agent for generating multi-hop QA pairs. (default: :obj:`None`)
+        """
         self.config = config
         self.multi_hop_agent = multi_hop_agent
 
     def construct_examples(
         self, raw_data: List[Dict[str, Any]]
     ) -> List[Dict[str, Any]]:
-        r"""Construct training examples."""
+        r"""Construct training examples from raw data.
+
+        Args:
+            raw_data (List[Dict[str, Any]]): List of raw data dictionaries
+                containing text and metadata.
+
+        Returns:
+            List[Dict[str, Any]]: List of constructed examples with QA pairs
+                and metadata.
+        """
         logger.info("Starting to construct training examples...")
         examples = []
 
@@ -135,7 +201,15 @@ def construct_examples(
         return examples
 
     def _preprocess_text(self, text: str) -> str:
-        r"""Text preprocessing."""
+        r"""Preprocess input text for example construction.
+
+        Args:
+            text (str): Input text to preprocess.
+
+        Returns:
+            str: Preprocessed text, or empty string if text fails quality
+                checks.
+        """
         if not isinstance(text, str):
             return ''
 
@@ -156,7 +230,14 @@ def _preprocess_text(self, text: str) -> str:
         return text
 
     def _check_text_quality(self, text: str) -> bool:
-        r"""Check text quality."""
+        r"""Check the quality of input text.
+
+        Args:
+            text (str): Text to check quality for.
+
+        Returns:
+            bool: True if text passes quality checks, False otherwise.
+        """
         # 1. Basic quality check
         if text.count('.') < 2:  # Must have at least 2 sentences
             return False
@@ -171,7 +252,15 @@ def _check_text_quality(self, text: str) -> bool:
         return True
 
     def _extract_info_pairs(self, text: str) -> List[Dict[str, Sequence[str]]]:
-        r"""Extract information pairs and relationships."""
+        r"""Extract information pairs and relationships from text.
+
+        Args:
+            text (str): Input text to extract information from.
+
+        Returns:
+            List[Dict[str, Sequence[str]]]: List of dictionaries containing
+                premise, intermediate, conclusion, and related contexts.
+        """
         # Split into sentences
         sentences = [s.strip() for s in text.split('.') if s.strip()]
         info_pairs = []
@@ -200,7 +289,15 @@ def _extract_info_pairs(self, text: str) -> List[Dict[str, Sequence[str]]]:
     def _generate_qa_pairs(
         self, info_pairs: List[Dict[str, Sequence[str]]]
     ) -> List[Dict[str, str]]:
-        r"""Generate multi-hop question-answer pairs."""
+        r"""Generate multi-hop question-answer pairs from information pairs.
+
+        Args:
+            info_pairs (List[Dict[str, Sequence[str]]]): List of information
+                pairs extracted from text.
+
+        Returns:
+            List[Dict[str, str]]: List of generated QA pairs.
+        """
         qa_pairs = []
 
         for pair in info_pairs:
@@ -219,7 +316,15 @@ def _generate_qa_pairs(
         return qa_pairs
 
     def _calculate_complexity(self, qa_pairs: List[Dict[str, Any]]) -> float:
-        r"""Calculate complexity of QA pairs."""
+        r"""Calculate the complexity score for a set of QA pairs.
+
+        Args:
+            qa_pairs (List[Dict[str, Any]]): List of QA pairs to calculate
+                complexity for.
+
+        Returns:
+            float: Complexity score between 0.0 and 1.0.
+        """
         if not qa_pairs:
             return 0.0
 
@@ -233,10 +338,10 @@ def _calculate_complexity(self, qa_pairs: List[Dict[str, Any]]) -> float:
             supporting_facts_count = len(qa.get('supporting_facts', []))
 
             # 3. Question length
-            question_length = len(qa['question'].split())
+            question_length = len(qa.get('question', '').split())
 
             # 4. Answer length
-            answer_length = len(qa['answer'].split())
+            answer_length = len(qa.get('answer', '').split())
 
             # Calculate complexity of a single QA pair
             qa_complexity = (
@@ -256,15 +361,37 @@ def _calculate_complexity(self, qa_pairs: List[Dict[str, Any]]) -> float:
 
 
 class DataCurator:
-    r"""Data Manager."""
+    r"""Manages and curates datasets of multi-hop question-answer pairs.
+
+    This class handles dataset management tasks including quality filtering,
+    complexity filtering, deduplication, and dataset sampling.
 
-    def __init__(self, config: ProcessorConfig):
+    Attributes:
+        config (ProcessorConfig): Configuration for data curation parameters.
+        rng (random.Random): Random number generator for reproducible sampling.
+    """
+
+    def __init__(self, config: ProcessorConfig, rng: random.Random):
+        r"""Initialize the DataCurator.
+
+        Args:
+            config (ProcessorConfig): Configuration for data curation.
+            rng (random.Random): Random number generator for reproducibility.
+        """
         self.config = config
+        self.rng = rng
 
     def curate_dataset(
         self, examples: List[Dict[str, Any]]
     ) -> List[Dict[str, Any]]:
-        r"""Dataset management."""
+        r"""Manage and curate a dataset through multiple filtering stages.
+
+        Args:
+            examples (List[Dict[str, Any]]): List of examples to curate.
+
+        Returns:
+            List[Dict[str, Any]]: Curated dataset meeting quality criteria.
+        """
         logger.info("Starting dataset management...")
 
         # 1. Quality filtering
@@ -296,7 +423,14 @@ def curate_dataset(
     def _quality_filter(
         self, examples: List[Dict[str, Any]]
     ) -> List[Dict[str, Any]]:
-        r"""Quality filtering."""
+        r"""Filter examples based on quality criteria.
+
+        Args:
+            examples (List[Dict[str, Any]]): List of examples to filter.
+
+        Returns:
+            List[Dict[str, Any]]: Examples that pass quality checks.
+        """
         filtered = []
 
         for example in examples:
@@ -314,7 +448,14 @@ def _quality_filter(
         return filtered
 
     def _check_qa_quality(self, qa_pairs: List[Dict[str, str]]) -> bool:
-        r"""Check quality of QA pairs."""
+        r"""Check the quality of question-answer pairs.
+
+        Args:
+            qa_pairs (List[Dict[str, str]]): List of QA pairs to check.
+
+        Returns:
+            bool: True if QA pairs meet quality criteria, False otherwise.
+        """
         if not qa_pairs:
             return False
 
@@ -335,7 +476,17 @@ def _check_qa_quality(self, qa_pairs: List[Dict[str, str]]) -> bool:
     def _complexity_filter(
         self, examples: List[Dict[str, Any]]
     ) -> List[Dict[str, Any]]:
-        r"""Complexity filtering."""
+        r"""Filter examples based on complexity threshold.
+
+        Removes examples with complexity scores below the configured
+        threshold.
+
+        Args:
+            examples (List[Dict[str, Any]]): List of examples to filter.
+
+        Returns:
+            List[Dict[str, Any]]: Examples meeting complexity threshold.
+        """
         return [
             example
             for example in examples
@@ -346,7 +497,14 @@ def _complexity_filter(
     def _remove_duplicates(
         self, examples: List[Dict[str, Any]]
     ) -> List[Dict[str, Any]]:
-        r"""Remove duplicates."""
+        r"""Remove duplicate examples from the dataset.
+
+        Args:
+            examples (List[Dict[str, Any]]): List of examples to deduplicate.
+
+        Returns:
+            List[Dict[str, Any]]: Deduplicated examples.
+        """
         seen = set()
         unique_examples = []
 
@@ -366,8 +524,15 @@ def _remove_duplicates(
     def _sample_dataset(
         self, examples: List[Dict[str, Any]]
     ) -> List[Dict[str, Any]]:
-        r"""Sample to target dataset size."""
+        r"""Sample examples to match target dataset size.
+
+        Args:
+            examples (List[Dict[str, Any]]): List of examples to sample from.
+
+        Returns:
+            List[Dict[str, Any]]: Sampled dataset of target size or smaller.
+        """
         if len(examples) <= self.config.dataset_size:
             return examples
 
-        return random.sample(examples, self.config.dataset_size)
+        return self.rng.sample(examples, self.config.dataset_size)
diff --git a/camel/synthetic_datagen/source2synth/models.py b/camel/datagen/source2synth/models.py
similarity index 73%
rename from camel/synthetic_datagen/source2synth/models.py
rename to camel/datagen/source2synth/models.py
index 568581411c..b85b228f88 100644
--- a/camel/synthetic_datagen/source2synth/models.py
+++ b/camel/datagen/source2synth/models.py
@@ -17,12 +17,30 @@
 
 
 class ReasoningStep(BaseModel):
+    r"""A single step in a multi-hop reasoning process.
+
+    Attributes:
+        step (str): The textual description of the reasoning step.
+    """
+
     step: str = Field(
         ..., description="A single step in the reasoning process."
     )
 
 
 class MultiHopQA(BaseModel):
+    r"""A multi-hop question-answer pair with reasoning steps and supporting
+    facts.
+
+    Attributes:
+        question (str): The question requiring multi-hop reasoning.
+        reasoning_steps (List[ReasoningStep]): Reasoning steps leading
+            to the answer.
+        answer (str): The final answer to the question.
+        supporting_facts (List[str]): List of facts supporting the reasoning.
+        type (str): The type of question-answer pair.
+    """
+
     question: str = Field(
         ..., description="The question that requires multi-hop reasoning."
     )
@@ -57,6 +75,13 @@ class Config:
 
 
 class ContextPrompt(BaseModel):
+    r"""A context prompt for generating multi-hop question-answer pairs.
+
+    Attributes:
+        main_context (str): The primary context for generating QA pairs.
+        related_contexts (Optional[List[str]]): Additional related contexts.
+    """
+
     main_context: str = Field(
         ...,
         description="The main context for generating"
diff --git a/camel/synthetic_datagen/source2synth/user_data_processor_config.py b/camel/datagen/source2synth/user_data_processor_config.py
similarity index 85%
rename from camel/synthetic_datagen/source2synth/user_data_processor_config.py
rename to camel/datagen/source2synth/user_data_processor_config.py
index 9b99b9b831..8acc8cdaae 100644
--- a/camel/synthetic_datagen/source2synth/user_data_processor_config.py
+++ b/camel/datagen/source2synth/user_data_processor_config.py
@@ -23,7 +23,15 @@ class ProcessorConfig(BaseModel):
     r"""Data processing configuration class"""
 
     def __repr__(self):
-        return "MultiHopGeneratorAgent()"
+        return (
+            f"ProcessorConfig("
+            f"seed={self.seed}, min_length={self.min_length}, "
+            f"max_length={self.max_length}, "
+            f"complexity_threshold={self.complexity_threshold}, "
+            f"dataset_size={self.dataset_size}, "
+            f"use_ai_model={self.use_ai_model}"
+            f")"
+        )
 
     model_config = ConfigDict(
         validate_assignment=True,
@@ -45,13 +53,6 @@ def __repr__(self):
         default=512, description="Maximum text length", gt=0
     )
 
-    quality_threshold: float = Field(
-        default=0.7,
-        description="Quality threshold for processing",
-        ge=0.0,
-        le=1.0,
-    )
-
     complexity_threshold: float = Field(
         default=0.5,
         description="Complexity threshold for processing",
diff --git a/examples/synthetic_datagen/self_instruct/data_output.json b/examples/datagen/self_instruct/data_output.json
similarity index 100%
rename from examples/synthetic_datagen/self_instruct/data_output.json
rename to examples/datagen/self_instruct/data_output.json
diff --git a/examples/synthetic_datagen/self_instruct/seed_tasks.jsonl b/examples/datagen/self_instruct/seed_tasks.jsonl
similarity index 100%
rename from examples/synthetic_datagen/self_instruct/seed_tasks.jsonl
rename to examples/datagen/self_instruct/seed_tasks.jsonl
diff --git a/examples/synthetic_datagen/self_instruct/self_instruct.py b/examples/datagen/self_instruct/self_instruct.py
similarity index 100%
rename from examples/synthetic_datagen/self_instruct/self_instruct.py
rename to examples/datagen/self_instruct/self_instruct.py
diff --git a/examples/source2synth.py b/examples/datagen/source2synth.py
similarity index 59%
rename from examples/source2synth.py
rename to examples/datagen/source2synth.py
index ba2549aedc..c4f236acca 100644
--- a/examples/source2synth.py
+++ b/examples/datagen/source2synth.py
@@ -15,10 +15,10 @@
 import json
 import logging
 
-from camel.synthetic_datagen.source2synth.data_processor import (
+from camel.datagen.source2synth.data_processor import (
     UserDataProcessor,
 )
-from camel.synthetic_datagen.source2synth.user_data_processor_config import (
+from camel.datagen.source2synth.user_data_processor_config import (
     ProcessorConfig,
 )
 
@@ -43,7 +43,6 @@ def main():
         seed=42,
         min_length=50,
         max_length=1000,
-        quality_threshold=0.7,
         complexity_threshold=0.5,
         dataset_size=10,
         use_ai_model=True,
@@ -171,3 +170,107 @@ def main():
 
 if __name__ == "__main__":
     main()
+
+
+'''
+===============================================================================
+Constructing examples: 100%|
+███████████████████████████████████████████████████████████████████████████████
+█| 1/1 [00:10<00:00, 10.98s/it]
+Constructing examples: 100%|
+███████████████████████████████████████████████████████████████████████████████
+█| 3/3 [00:22<00:00,  7.64s/it]
+
+=== Single Text Processing Example ===
+
+Text 1:
+Source: technology_evolution
+Complexity: 0.88
+
+Q&A Pairs:
+
+Q&A Pair 1:
+Type: multi_hop_qa
+Question: How did the invention of transistors impact the development of 
+personal computers?
+Reasoning Steps:
+1. {'step': 'Identify the role of transistors in electronics.'}
+2. {'step': 'Understand how transistors enabled the miniaturization of 
+computers.'}
+3. {'step': 'Connect the miniaturization of computers to the creation of 
+personal computers in the 1980s.'}
+4. {'step': 'Determine the overall impact of personal computers on work and 
+communication.'}
+Answer: The invention of transistors allowed for smaller and more efficient 
+computers, which led to the development of personal computers in the 1980s, 
+transforming work and communication.
+Supporting Facts:
+1. Transistors are semiconductor devices that revolutionized electronics.
+2. The miniaturization of computers was made possible by transistors.
+3. Personal computers emerged in the 1980s as a result of smaller computer 
+designs.
+4. Personal computers changed how people work and communicate.
+
+Q&A Pair 2:
+Type: multi_hop_qa
+Question: What was the sequence of developments that led from transistors to 
+the internet?
+Reasoning Steps:
+1. {'step': 'Identify how transistors contributed to the development of 
+smaller and more efficient computers.'}
+2. {'step': 'Explain how the miniaturization of computers resulted in the 
+creation of personal computers in the 1980s.'}
+3. {'step': 'Discuss how personal computers transformed work and communication.
+'}
+4. {'step': 'Connect the transformation in communication to the rise of the 
+internet.'}
+Answer: Transistors enabled smaller computers, which led to personal computers 
+in the 1980s, transforming communication and eventually giving rise to the 
+internet.
+Supporting Facts:
+1. Transistors are tiny semiconductor devices that made computers smaller and 
+more efficient.
+2. The miniaturization of computers allowed for the creation of personal 
+computers in the 1980s.
+3. Personal computers transformed how people work and communicate.
+4. The digital revolution and personal computers contributed to the rise of 
+the internet, connecting billions worldwide.
+
+Q&A Pair 3:
+Type: multi_hop_qa
+Question: How did the miniaturization of computers contribute to the 
+development of artificial intelligence systems today?
+Reasoning Steps:
+1. {'step': 'Identify the impact of miniaturization on the creation of 
+personal computers in the 1980s.'}
+2. {'step': 'Explain how personal computers transformed communication and work.
+'}
+3. {'step': 'Connect the digital revolution and the rise of the internet to 
+the development of artificial intelligence.'}
+4. {'step': 'Discuss how the interconnected network of the internet supports 
+AI systems in various industries.'}
+Answer: The miniaturization of computers led to personal computers, which 
+transformed communication and work, and this digital revolution, along with 
+the internet, supports the development of artificial intelligence systems 
+today.
+Supporting Facts:
+1. Miniaturization of computers enabled the creation of personal computers in 
+the 1980s.
+2. Personal computers transformed how people work and communicate.
+3. The digital revolution led to the rise of the internet, connecting billions 
+of people.
+4. The internet powers artificial intelligence systems that are reshaping 
+various industries.
+
+=== Batch Processing Statistics ===
+Total texts processed: 3
+Total Q&A pairs generated: 9
+
+=== Generation Statistics ===
+AI-generated multi-hop Q&A count: 9
+Template-generated multi-hop Q&A count: 0
+
+Average reasoning steps: 4.00
+Average complexity score: 0.90
+===============================================================================
+'''
\ No newline at end of file