From 5850463e72732884da3ffc0f2acd8e51388f26bb Mon Sep 17 00:00:00 2001 From: Wendong Date: Sun, 19 Jan 2025 05:46:19 +0800 Subject: [PATCH] feat: enhance source2synth --- camel/agents/multi_hop_generator_agent.py | 38 ++- camel/agents/programmed_agent_instruction.py | 91 +++++-- camel/datagen/source2synth/__init__.py | 31 +++ .../source2synth/data_processor.py | 223 +++++++++++++++--- .../source2synth/models.py | 25 ++ .../user_data_processor_config.py | 17 +- .../self_instruct/data_output.json | 0 .../self_instruct/seed_tasks.jsonl | 0 .../self_instruct/self_instruct.py | 0 examples/{ => datagen}/source2synth.py | 109 ++++++++- 10 files changed, 473 insertions(+), 61 deletions(-) create mode 100644 camel/datagen/source2synth/__init__.py rename camel/{synthetic_datagen => datagen}/source2synth/data_processor.py (61%) rename camel/{synthetic_datagen => datagen}/source2synth/models.py (73%) rename camel/{synthetic_datagen => datagen}/source2synth/user_data_processor_config.py (85%) rename examples/{synthetic_datagen => datagen}/self_instruct/data_output.json (100%) rename examples/{synthetic_datagen => datagen}/self_instruct/seed_tasks.jsonl (100%) rename examples/{synthetic_datagen => datagen}/self_instruct/self_instruct.py (100%) rename examples/{ => datagen}/source2synth.py (59%) diff --git a/camel/agents/multi_hop_generator_agent.py b/camel/agents/multi_hop_generator_agent.py index a232fce846..988342b9af 100644 --- a/camel/agents/multi_hop_generator_agent.py +++ b/camel/agents/multi_hop_generator_agent.py @@ -22,17 +22,36 @@ ProgrammedAgentInstructionResult, programmable_capability, ) -from camel.messages import BaseMessage -from camel.synthetic_datagen.source2synth.models import ( +from camel.datagen.source2synth.models import ( ContextPrompt, MultiHopQA, ) +from camel.messages import BaseMessage class MultiHopGeneratorAgent(ProgrammableChatAgent): + r"""An agent specialized in generating multi-hop question-answer pairs. + + This agent is designed to create complex questions that require multiple + steps of reasoning to answer. It analyzes context to identify related + facts and generates questions that require connecting these facts + logically. + + Attributes: + model_config (ConfigDict): Configuration for model behavior. + system_message (BaseMessage): System message defining agent's role and + instructions. + """ + model_config = ConfigDict(arbitrary_types_allowed=True) - def __init__(self, **kwargs: Any): + def __init__(self, **kwargs: Any) -> None: + r"""Initialize the MultiHopGeneratorAgent. + + Args: + **kwargs (Any): Additional keyword arguments to pass to parent + class. + """ super().__init__(**kwargs) system_text: str = textwrap.dedent( @@ -64,6 +83,19 @@ def __init__(self, **kwargs: Any): def generate_multi_hop_qa( self, context: str ) -> ProgrammedAgentInstructionResult[MultiHopQA]: + r"""Generate a multi-hop question-answer pair from given context. + + Args: + context (str): The input text context to generate QA from. + + Returns: + ProgrammedAgentInstructionResult[MultiHopQA]: Result containing the + generated question, reasoning steps, answer, and supporting + facts. + + Raises: + RuntimeError: If the agent fails to generate a response. + """ context_prompt = ContextPrompt( main_context=context, related_contexts=None ) diff --git a/camel/agents/programmed_agent_instruction.py b/camel/agents/programmed_agent_instruction.py index 708d5997bb..bf38d67107 100644 --- a/camel/agents/programmed_agent_instruction.py +++ b/camel/agents/programmed_agent_instruction.py @@ -26,6 +26,16 @@ class ProgrammableAgentRequirement(Enum): + r"""Requirements for programmable agent state. + + Defines the possible requirements that can be used to repair the state + of a programmable agent. + + Attributes: + LAST_MESSAGE_NOT_USER (str): Requires that the last message in the + conversation was not from the user. + """ + LAST_MESSAGE_NOT_USER = "LAST_MESSAGE_NOT_USER" @@ -34,6 +44,11 @@ class ProgrammedAgentInstructionResult(BaseModel, Generic[T]): Contains the messages exchanged during execution and the computed value. The value type is specified by the generic type parameter T. + + Attributes: + user_message (BaseMessage): The message sent by the user. + agent_message (BaseMessage): The message sent by the agent. + value (T): The computed result value of type T. """ user_message: BaseMessage @@ -48,8 +63,7 @@ class AbstractProgrammableAgent(abc.ABC): A programmable agent is an agent that can be programmed to perform a specific function or task. This class defines the interface for a - programmable - agent. + programmable agent. These methods should be implemented in order to ensure the agent supports the necessary guarantees to enable a programming interface while @@ -68,16 +82,15 @@ def run_atomic( An atomic operation is an operation that is guaranteed to be executed without interruption by any other operation. - If the operation fails or times out the agents state should be - unchanged. + Args: + callback (Callable[[], ProgrammedAgentInstructionResult[T]]): The + operation to execute atomically. - If an operation is already in progress, this method should throw an - exception. (It is up to the caller to do any queuing) + Returns: + ProgrammedAgentInstructionResult[T]: The result of the operation. - If the agent is in a state where it can perform the operation, - it must leave the agent in a state where it can perform the - operation again. Though if state changes in successful operation - improve its ability to perform the operation, it should keep them. + Raises: + RuntimeError: If an operation is already in progress. """ raise NotImplementedError @@ -86,10 +99,13 @@ def repair_state(self, requirement: ProgrammableAgentRequirement) -> None: r"""Repair the state of the agent. Agents may have other non-atomic interfaces, such as a user interface, - or chat between other agents. + or chat between other agents. This method should restore the agent to + a state where it can perform operations according to the specified + requirement. - This method should restore the agent to a state where it can perform - operations according to the specified requirement. + Args: + requirement (ProgrammableAgentRequirement): The requirement to + repair the state for. """ raise NotImplementedError @@ -99,10 +115,16 @@ def programmable_capability( ) -> Callable[..., ProgrammedAgentInstructionResult[T]]: r"""Decorator for programmable agent capabilities. - Wraps a method to ensure it is executed atomically via the agent's - run_atomic interface. - The decorated method must return a ProgrammedAgentInstructionResult with - appropriate type parameter. + This decorator ensures that the decorated method is executed atomically + and maintains the agent's state guarantees. + + Args: + func (Callable[..., ProgrammedAgentInstructionResult[T]]): The method + to decorate. + + Returns: + Callable[..., ProgrammedAgentInstructionResult[T]]: The decorated + method that ensures atomic execution. """ @wraps(func) @@ -120,9 +142,20 @@ class ProgrammableChatAgent(ChatAgent, AbstractProgrammableAgent): Provides a default implementation of atomic execution using threading locks and basic state tracking for message roles. Implementing classes need to provide specific repair logic for their use cases. + + Attributes: + _operation_lock (threading.Lock): Lock for ensuring atomic operations. + _last_message_role (Optional[str]): Role of the last message in the + conversation. """ - def __init__(self, **kwargs: Any): + def __init__(self, **kwargs: Any) -> None: + r"""Initialize the ProgrammableChatAgent. + + Args: + **kwargs (Any): Additional keyword arguments to pass to parent + class. + """ super().__init__(**kwargs) self._operation_lock = threading.Lock() self._last_message_role: Optional[str] = None @@ -130,6 +163,20 @@ def __init__(self, **kwargs: Any): def run_atomic( self, callback: Callable[[], ProgrammedAgentInstructionResult[T]] ) -> ProgrammedAgentInstructionResult[T]: + r"""Run an atomic operation on the agent. + + Ensures thread-safe execution of the callback function by using a lock. + + Args: + callback (Callable[[], ProgrammedAgentInstructionResult[T]]): The + operation to execute atomically. + + Returns: + ProgrammedAgentInstructionResult[T]: The result of the operation. + + Raises: + RuntimeError: If an operation is already in progress. + """ if not self._operation_lock.acquire(blocking=False): raise RuntimeError("Operation already in progress") @@ -141,6 +188,14 @@ def run_atomic( self._operation_lock.release() def repair_state(self, requirement: ProgrammableAgentRequirement) -> None: + r"""Repair the state of the agent. + + Implements basic state repair for message role requirements. + + Args: + requirement (ProgrammableAgentRequirement): The requirement to + repair the state for. + """ if requirement == ProgrammableAgentRequirement.LAST_MESSAGE_NOT_USER: if self._last_message_role == "user": raise NotImplementedError( diff --git a/camel/datagen/source2synth/__init__.py b/camel/datagen/source2synth/__init__.py new file mode 100644 index 0000000000..e9ddca05fb --- /dev/null +++ b/camel/datagen/source2synth/__init__.py @@ -0,0 +1,31 @@ +# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= +from .data_processor import ( + DataCurator, + ExampleConstructor, + UserDataProcessor, +) +from .models import MultiHopQA, ReasoningStep +from .user_data_processor_config import ( + ProcessorConfig, +) + +__all__ = [ + "DataCurator", + "ExampleConstructor", + "ProcessorConfig", + "UserDataProcessor", + "ReasoningStep", + "MultiHopQA", +] diff --git a/camel/synthetic_datagen/source2synth/data_processor.py b/camel/datagen/source2synth/data_processor.py similarity index 61% rename from camel/synthetic_datagen/source2synth/data_processor.py rename to camel/datagen/source2synth/data_processor.py index 9780663c40..ec7d84ecc4 100644 --- a/camel/synthetic_datagen/source2synth/data_processor.py +++ b/camel/datagen/source2synth/data_processor.py @@ -15,33 +15,61 @@ import random from typing import Any, Dict, List, Optional, Sequence -import numpy as np from tqdm import tqdm from camel.agents.multi_hop_generator_agent import MultiHopGeneratorAgent -from camel.logger import get_logger -from camel.synthetic_datagen.source2synth.user_data_processor_config import ( +from camel.datagen.source2synth.user_data_processor_config import ( ProcessorConfig, ) +from camel.logger import get_logger logger = get_logger(__name__) class UserDataProcessor: - r"""User Data Processor.""" + r"""A processor for generating multi-hop question-answer pairs from user + data. + + This class handles the processing of text data to generate multi-hop + question-answer pairs using either an AI model or rule-based approaches. + It manages the entire pipeline from text preprocessing to dataset curation. + + Attributes: + config (ProcessorConfig): Configuration for data processing parameters. + rng (random.Random): Random number generator for reproducibility. + multi_hop_agent (Optional[MultiHopGeneratorAgent]): Agent for + generating QA pairs. + """ def __init__(self, config: Optional[ProcessorConfig] = None): + r"""Initialize the UserDataProcessor. + + Args: + config (Optional[ProcessorConfig], optional): Configuration for + data processing. (default: :obj:`None`) + """ self.config = config or ProcessorConfig() - random.seed(self.config.seed) - np.random.seed(self.config.seed) + self.rng = random.Random(self.config.seed) self.multi_hop_agent = ( - MultiHopGeneratorAgent() if self.config.use_ai_model else None + self.config.hop_generating_agent + if self.config.use_ai_model + else None ) def process_text( self, text: str, source: str = "user_input" ) -> List[Dict[str, Any]]: - r"""Process a single text.""" + r"""Process a single text to generate multi-hop QA pairs. + + Args: + text (str): The input text to process. + source (str, optional): Source identifier for the text. + (default: :obj:`"user_input"`) + + Returns: + List[Dict[str, Any]]: List of processed examples with QA pairs and + metadata. + """ # Convert text to standard format raw_data = [ { @@ -55,7 +83,7 @@ def process_text( examples = constructor.construct_examples(raw_data) # Manage data - curator = DataCurator(self.config) + curator = DataCurator(self.config, self.rng) final_dataset = curator.curate_dataset(examples) return final_dataset @@ -63,7 +91,20 @@ def process_text( def process_batch( self, texts: List[str], sources: Optional[List[str]] = None ) -> List[Dict[str, Any]]: - r"""Process multiple texts in batch.""" + r"""Process multiple texts in batch to generate multi-hop QA pairs. + + Args: + texts (List[str]): List of input texts to process. + sources (Optional[List[str]], optional): List of source + identifiers. (default: :obj:`None`) + + Returns: + List[Dict[str, Any]]: List of processed examples with QA pairs and + metadata. + + Raises: + ValueError: If length of sources doesn't match length of texts. + """ if sources is None: sources = ["user_input"] * len(texts) elif len(sources) != len(texts): @@ -82,27 +123,52 @@ def process_batch( examples = constructor.construct_examples(raw_data) # Manage data - curator = DataCurator(self.config) + curator = DataCurator(self.config, self.rng) final_dataset = curator.curate_dataset(examples) return final_dataset class ExampleConstructor: - r"""Example Constructor.""" + r"""Constructs training examples from raw text data. + + This class handles the construction of training examples by preprocessing + text, extracting information pairs, and generating question-answer pairs. + + Attributes: + config (ProcessorConfig): Configuration for example construction. + multi_hop_agent (Optional[MultiHopGeneratorAgent]): Agent for QA + generation. + """ def __init__( self, config: ProcessorConfig, multi_hop_agent: Optional[MultiHopGeneratorAgent] = None, ): + r"""Initialize the ExampleConstructor. + + Args: + config (ProcessorConfig): Configuration for example construction. + multi_hop_agent (Optional[MultiHopGeneratorAgent], optional): + Agent for generating multi-hop QA pairs. (default: :obj:`None`) + """ self.config = config self.multi_hop_agent = multi_hop_agent def construct_examples( self, raw_data: List[Dict[str, Any]] ) -> List[Dict[str, Any]]: - r"""Construct training examples.""" + r"""Construct training examples from raw data. + + Args: + raw_data (List[Dict[str, Any]]): List of raw data dictionaries + containing text and metadata. + + Returns: + List[Dict[str, Any]]: List of constructed examples with QA pairs + and metadata. + """ logger.info("Starting to construct training examples...") examples = [] @@ -135,7 +201,15 @@ def construct_examples( return examples def _preprocess_text(self, text: str) -> str: - r"""Text preprocessing.""" + r"""Preprocess input text for example construction. + + Args: + text (str): Input text to preprocess. + + Returns: + str: Preprocessed text, or empty string if text fails quality + checks. + """ if not isinstance(text, str): return '' @@ -156,7 +230,14 @@ def _preprocess_text(self, text: str) -> str: return text def _check_text_quality(self, text: str) -> bool: - r"""Check text quality.""" + r"""Check the quality of input text. + + Args: + text (str): Text to check quality for. + + Returns: + bool: True if text passes quality checks, False otherwise. + """ # 1. Basic quality check if text.count('.') < 2: # Must have at least 2 sentences return False @@ -171,7 +252,15 @@ def _check_text_quality(self, text: str) -> bool: return True def _extract_info_pairs(self, text: str) -> List[Dict[str, Sequence[str]]]: - r"""Extract information pairs and relationships.""" + r"""Extract information pairs and relationships from text. + + Args: + text (str): Input text to extract information from. + + Returns: + List[Dict[str, Sequence[str]]]: List of dictionaries containing + premise, intermediate, conclusion, and related contexts. + """ # Split into sentences sentences = [s.strip() for s in text.split('.') if s.strip()] info_pairs = [] @@ -200,7 +289,15 @@ def _extract_info_pairs(self, text: str) -> List[Dict[str, Sequence[str]]]: def _generate_qa_pairs( self, info_pairs: List[Dict[str, Sequence[str]]] ) -> List[Dict[str, str]]: - r"""Generate multi-hop question-answer pairs.""" + r"""Generate multi-hop question-answer pairs from information pairs. + + Args: + info_pairs (List[Dict[str, Sequence[str]]]): List of information + pairs extracted from text. + + Returns: + List[Dict[str, str]]: List of generated QA pairs. + """ qa_pairs = [] for pair in info_pairs: @@ -219,7 +316,15 @@ def _generate_qa_pairs( return qa_pairs def _calculate_complexity(self, qa_pairs: List[Dict[str, Any]]) -> float: - r"""Calculate complexity of QA pairs.""" + r"""Calculate the complexity score for a set of QA pairs. + + Args: + qa_pairs (List[Dict[str, Any]]): List of QA pairs to calculate + complexity for. + + Returns: + float: Complexity score between 0.0 and 1.0. + """ if not qa_pairs: return 0.0 @@ -233,10 +338,10 @@ def _calculate_complexity(self, qa_pairs: List[Dict[str, Any]]) -> float: supporting_facts_count = len(qa.get('supporting_facts', [])) # 3. Question length - question_length = len(qa['question'].split()) + question_length = len(qa.get('question', '').split()) # 4. Answer length - answer_length = len(qa['answer'].split()) + answer_length = len(qa.get('answer', '').split()) # Calculate complexity of a single QA pair qa_complexity = ( @@ -256,15 +361,37 @@ def _calculate_complexity(self, qa_pairs: List[Dict[str, Any]]) -> float: class DataCurator: - r"""Data Manager.""" + r"""Manages and curates datasets of multi-hop question-answer pairs. + + This class handles dataset management tasks including quality filtering, + complexity filtering, deduplication, and dataset sampling. - def __init__(self, config: ProcessorConfig): + Attributes: + config (ProcessorConfig): Configuration for data curation parameters. + rng (random.Random): Random number generator for reproducible sampling. + """ + + def __init__(self, config: ProcessorConfig, rng: random.Random): + r"""Initialize the DataCurator. + + Args: + config (ProcessorConfig): Configuration for data curation. + rng (random.Random): Random number generator for reproducibility. + """ self.config = config + self.rng = rng def curate_dataset( self, examples: List[Dict[str, Any]] ) -> List[Dict[str, Any]]: - r"""Dataset management.""" + r"""Manage and curate a dataset through multiple filtering stages. + + Args: + examples (List[Dict[str, Any]]): List of examples to curate. + + Returns: + List[Dict[str, Any]]: Curated dataset meeting quality criteria. + """ logger.info("Starting dataset management...") # 1. Quality filtering @@ -296,7 +423,14 @@ def curate_dataset( def _quality_filter( self, examples: List[Dict[str, Any]] ) -> List[Dict[str, Any]]: - r"""Quality filtering.""" + r"""Filter examples based on quality criteria. + + Args: + examples (List[Dict[str, Any]]): List of examples to filter. + + Returns: + List[Dict[str, Any]]: Examples that pass quality checks. + """ filtered = [] for example in examples: @@ -314,7 +448,14 @@ def _quality_filter( return filtered def _check_qa_quality(self, qa_pairs: List[Dict[str, str]]) -> bool: - r"""Check quality of QA pairs.""" + r"""Check the quality of question-answer pairs. + + Args: + qa_pairs (List[Dict[str, str]]): List of QA pairs to check. + + Returns: + bool: True if QA pairs meet quality criteria, False otherwise. + """ if not qa_pairs: return False @@ -335,7 +476,17 @@ def _check_qa_quality(self, qa_pairs: List[Dict[str, str]]) -> bool: def _complexity_filter( self, examples: List[Dict[str, Any]] ) -> List[Dict[str, Any]]: - r"""Complexity filtering.""" + """ + Filter examples based on complexity threshold. + + Removes examples with complexity scores below the configured threshold. + + Args: + examples (List[Dict[str, Any]]): List of examples to filter. + + Returns: + List[Dict[str, Any]]: Examples meeting complexity threshold. + """ return [ example for example in examples @@ -346,7 +497,14 @@ def _complexity_filter( def _remove_duplicates( self, examples: List[Dict[str, Any]] ) -> List[Dict[str, Any]]: - r"""Remove duplicates.""" + r"""Remove duplicate examples from the dataset. + + Args: + examples (List[Dict[str, Any]]): List of examples to deduplicate. + + Returns: + List[Dict[str, Any]]: Deduplicated examples. + """ seen = set() unique_examples = [] @@ -366,8 +524,15 @@ def _remove_duplicates( def _sample_dataset( self, examples: List[Dict[str, Any]] ) -> List[Dict[str, Any]]: - r"""Sample to target dataset size.""" + r"""Sample examples to match target dataset size. + + Args: + examples (List[Dict[str, Any]]): List of examples to sample from. + + Returns: + List[Dict[str, Any]]: Sampled dataset of target size or smaller. + """ if len(examples) <= self.config.dataset_size: return examples - return random.sample(examples, self.config.dataset_size) + return self.rng.sample(examples, self.config.dataset_size) diff --git a/camel/synthetic_datagen/source2synth/models.py b/camel/datagen/source2synth/models.py similarity index 73% rename from camel/synthetic_datagen/source2synth/models.py rename to camel/datagen/source2synth/models.py index 568581411c..b85b228f88 100644 --- a/camel/synthetic_datagen/source2synth/models.py +++ b/camel/datagen/source2synth/models.py @@ -17,12 +17,30 @@ class ReasoningStep(BaseModel): + r"""A single step in a multi-hop reasoning process. + + Attributes: + step (str): The textual description of the reasoning step. + """ + step: str = Field( ..., description="A single step in the reasoning process." ) class MultiHopQA(BaseModel): + r"""A multi-hop question-answer pair with reasoning steps and supporting + facts. + + Attributes: + question (str): The question requiring multi-hop reasoning. + reasoning_steps (List[ReasoningStep]): List of reasoning steps to + answer. + answer (str): The final answer to the question. + supporting_facts (List[str]): List of facts supporting the reasoning. + type (str): The type of question-answer pair. + """ + question: str = Field( ..., description="The question that requires multi-hop reasoning." ) @@ -57,6 +75,13 @@ class Config: class ContextPrompt(BaseModel): + r"""A context prompt for generating multi-hop question-answer pairs. + + Attributes: + main_context (str): The primary context for generating QA pairs. + related_contexts (Optional[List[str]]): Additional related contexts. + """ + main_context: str = Field( ..., description="The main context for generating" diff --git a/camel/synthetic_datagen/source2synth/user_data_processor_config.py b/camel/datagen/source2synth/user_data_processor_config.py similarity index 85% rename from camel/synthetic_datagen/source2synth/user_data_processor_config.py rename to camel/datagen/source2synth/user_data_processor_config.py index 9b99b9b831..8acc8cdaae 100644 --- a/camel/synthetic_datagen/source2synth/user_data_processor_config.py +++ b/camel/datagen/source2synth/user_data_processor_config.py @@ -23,7 +23,15 @@ class ProcessorConfig(BaseModel): r"""Data processing configuration class""" def __repr__(self): - return "MultiHopGeneratorAgent()" + return ( + f"ProcessorConfig(" + f"seed={self.seed}, min_length={self.min_length}, " + f"max_length={self.max_length}, " + f"complexity_threshold={self.complexity_threshold}, " + f"dataset_size={self.dataset_size}, " + f"use_ai_model={self.use_ai_model}" + f")" + ) model_config = ConfigDict( validate_assignment=True, @@ -45,13 +53,6 @@ def __repr__(self): default=512, description="Maximum text length", gt=0 ) - quality_threshold: float = Field( - default=0.7, - description="Quality threshold for processing", - ge=0.0, - le=1.0, - ) - complexity_threshold: float = Field( default=0.5, description="Complexity threshold for processing", diff --git a/examples/synthetic_datagen/self_instruct/data_output.json b/examples/datagen/self_instruct/data_output.json similarity index 100% rename from examples/synthetic_datagen/self_instruct/data_output.json rename to examples/datagen/self_instruct/data_output.json diff --git a/examples/synthetic_datagen/self_instruct/seed_tasks.jsonl b/examples/datagen/self_instruct/seed_tasks.jsonl similarity index 100% rename from examples/synthetic_datagen/self_instruct/seed_tasks.jsonl rename to examples/datagen/self_instruct/seed_tasks.jsonl diff --git a/examples/synthetic_datagen/self_instruct/self_instruct.py b/examples/datagen/self_instruct/self_instruct.py similarity index 100% rename from examples/synthetic_datagen/self_instruct/self_instruct.py rename to examples/datagen/self_instruct/self_instruct.py diff --git a/examples/source2synth.py b/examples/datagen/source2synth.py similarity index 59% rename from examples/source2synth.py rename to examples/datagen/source2synth.py index ba2549aedc..c4f236acca 100644 --- a/examples/source2synth.py +++ b/examples/datagen/source2synth.py @@ -15,10 +15,10 @@ import json import logging -from camel.synthetic_datagen.source2synth.data_processor import ( +from camel.datagen.source2synth.data_processor import ( UserDataProcessor, ) -from camel.synthetic_datagen.source2synth.user_data_processor_config import ( +from camel.datagen.source2synth.user_data_processor_config import ( ProcessorConfig, ) @@ -43,7 +43,6 @@ def main(): seed=42, min_length=50, max_length=1000, - quality_threshold=0.7, complexity_threshold=0.5, dataset_size=10, use_ai_model=True, @@ -171,3 +170,107 @@ def main(): if __name__ == "__main__": main() + + +''' +=============================================================================== +Constructing examples: 100%| +███████████████████████████████████████████████████████████████████████████████ +█| 1/1 [00:10<00:00, 10.98s/it] +Constructing examples: 100%| +███████████████████████████████████████████████████████████████████████████████ +█| 3/3 [00:22<00:00, 7.64s/it] + +=== Single Text Processing Example === + +Text 1: +Source: technology_evolution +Complexity: 0.88 + +Q&A Pairs: + +Q&A Pair 1: +Type: multi_hop_qa +Question: How did the invention of transistors impact the development of +personal computers? +Reasoning Steps: +1. {'step': 'Identify the role of transistors in electronics.'} +2. {'step': 'Understand how transistors enabled the miniaturization of +computers.'} +3. {'step': 'Connect the miniaturization of computers to the creation of +personal computers in the 1980s.'} +4. {'step': 'Determine the overall impact of personal computers on work and +communication.'} +Answer: The invention of transistors allowed for smaller and more efficient +computers, which led to the development of personal computers in the 1980s, +transforming work and communication. +Supporting Facts: +1. Transistors are semiconductor devices that revolutionized electronics. +2. The miniaturization of computers was made possible by transistors. +3. Personal computers emerged in the 1980s as a result of smaller computer +designs. +4. Personal computers changed how people work and communicate. + +Q&A Pair 2: +Type: multi_hop_qa +Question: What was the sequence of developments that led from transistors to +the internet? +Reasoning Steps: +1. {'step': 'Identify how transistors contributed to the development of +smaller and more efficient computers.'} +2. {'step': 'Explain how the miniaturization of computers resulted in the +creation of personal computers in the 1980s.'} +3. {'step': 'Discuss how personal computers transformed work and communication. +'} +4. {'step': 'Connect the transformation in communication to the rise of the +internet.'} +Answer: Transistors enabled smaller computers, which led to personal computers +in the 1980s, transforming communication and eventually giving rise to the +internet. +Supporting Facts: +1. Transistors are tiny semiconductor devices that made computers smaller and +more efficient. +2. The miniaturization of computers allowed for the creation of personal +computers in the 1980s. +3. Personal computers transformed how people work and communicate. +4. The digital revolution and personal computers contributed to the rise of +the internet, connecting billions worldwide. + +Q&A Pair 3: +Type: multi_hop_qa +Question: How did the miniaturization of computers contribute to the +development of artificial intelligence systems today? +Reasoning Steps: +1. {'step': 'Identify the impact of miniaturization on the creation of +personal computers in the 1980s.'} +2. {'step': 'Explain how personal computers transformed communication and work. +'} +3. {'step': 'Connect the digital revolution and the rise of the internet to +the development of artificial intelligence.'} +4. {'step': 'Discuss how the interconnected network of the internet supports +AI systems in various industries.'} +Answer: The miniaturization of computers led to personal computers, which +transformed communication and work, and this digital revolution, along with +the internet, supports the development of artificial intelligence systems +today. +Supporting Facts: +1. Miniaturization of computers enabled the creation of personal computers in +the 1980s. +2. Personal computers transformed how people work and communicate. +3. The digital revolution led to the rise of the internet, connecting billions +of people. +4. The internet powers artificial intelligence systems that are reshaping +various industries. + +=== Batch Processing Statistics === +Total texts processed: 3 +Total Q&A pairs generated: 9 + +=== Generation Statistics === +AI-generated multi-hop Q&A count: 9 +Template-generated multi-hop Q&A count: 0 + +Average reasoning steps: 4.00 +Average complexity score: 0.90 +=============================================================================== +''' \ No newline at end of file