Source code for agentscope.parsers.regex_tagged_content_parser
+# -*- coding: utf-8 -*-
+"""The parser for dynamic tagged content"""
+import json
+import re
+from typing import Union, Sequence, Optional, List
+
+from loguru import logger
+
+from ..exception import TagNotFoundError
+from ..models import ModelResponse
+from ..parsers import ParserBase
+from ..parsers.parser_base import DictFilterMixin
+
+
+
+[docs]
+class RegexTaggedContentParser(ParserBase, DictFilterMixin):
+ """A regex tagged content parser, which extracts tagged content according
+ to the provided regex pattern. Different from other parsers, this parser
+ allows to extract multiple tagged content without knowing the keys in
+ advance. The parsed result will be a dictionary within the parsed field of
+ the model response.
+
+ Compared with other parsers, this parser is more flexible and can be used
+ in dynamic scenarios where
+ - the keys are not known in advance
+ - the number of the tagged content is not fixed
+
+ Note: Without knowing the keys in advance, it's hard to prepare a format
+ instruction template for different scenarios. Therefore, we ask the user
+ to provide the format instruction in the constructor. Of course, the user
+ can construct and manage the prompt by themselves optionally.
+
+ Example:
+ By default, the parser use a regex pattern to extract tagged content
+ with the following format:
+ ```
+ <{name1}>{content1}</{name1}>
+ <{name2}>{content2}</{name2}>
+ ```
+ The parser will extract the content as the following dictionary:
+ ```
+ {
+ "name1": content1,
+ "name2": content2,
+ }
+ ```
+ """
+
+
+[docs]
+ def __init__(
+ self,
+ tagged_content_pattern: str = r"<(?P<name>[^>]+)>"
+ r"(?P<content>.*?)"
+ r"</\1?>",
+ format_instruction: Optional[str] = None,
+ try_parse_json: bool = True,
+ required_keys: Optional[List[str]] = None,
+ keys_to_memory: Union[str, bool, Sequence[str]] = True,
+ keys_to_content: Union[str, bool, Sequence[str]] = True,
+ keys_to_metadata: Union[str, bool, Sequence[str]] = False,
+ ) -> None:
+ """Initialize the regex tagged content parser.
+
+ Args:
+ tagged_content_pattern (`Optional[str]`, defaults to
+ `"<(?P<name>[^>]+)>(?P<content>.*?)</\1?>"`):
+ The regex pattern to extract tagged content. The pattern should
+ contain two named groups: `name` and `content`. The `name`
+ group is used as the key of the tagged content, and the
+ `content` group is used as the value.
+ format_instruction (`Optional[str]`, defaults to `None`):
+ The instruction for the format of the tagged content, which
+ will be attached to the end of the prompt messages to remind
+ the LLM to follow the format.
+ try_parse_json (`bool`, defaults to `True`):
+ Whether to try to parse the tagged content as JSON. Note
+ the parsing function won't raise exceptions.
+ required_keys (`Optional[List[str]]`, defaults to `None`):
+ The keys that are required in the tagged content.
+ keys_to_memory (`Union[str, bool, Sequence[str]]`,
+ defaults to `True`):
+ The keys to save to memory.
+ keys_to_content (`Union[str, bool, Sequence[str]]`,
+ defaults to `True`):
+ The keys to save to content.
+ keys_to_metadata (`Union[str, bool, Sequence[str]]`,
+ defaults to `False`):
+ The key or keys to be filtered in `to_metadata` method. If
+ it's
+ - `False`, `None` will be returned in the `to_metadata` method
+ - `str`, the corresponding value will be returned
+ - `List[str]`, a filtered dictionary will be returned
+ - `True`, the whole dictionary will be returned
+ """
+
+ DictFilterMixin.__init__(
+ self,
+ keys_to_memory=keys_to_memory,
+ keys_to_content=keys_to_content,
+ keys_to_metadata=keys_to_metadata,
+ )
+
+ assert (
+ "<name>" in tagged_content_pattern
+ ), "The tagged content pattern should contain a named group 'name'."
+ assert (
+ "<content>" in tagged_content_pattern
+ ), "The tagged content pattern should contain a named group 'content'."
+
+ self.tagged_content_pattern = tagged_content_pattern
+ self._format_instruction = format_instruction
+ self.try_parse_json = try_parse_json
+ self.required_keys = required_keys or []
+
+
+ @property
+ def format_instruction(self) -> str:
+ """The format instruction for the tagged content."""
+ if self._format_instruction is None:
+ raise ValueError(
+ "The format instruction is not provided. Please provide it in "
+ "the constructor of the parser.",
+ )
+ return self._format_instruction
+
+
+[docs]
+ def parse(self, response: ModelResponse) -> ModelResponse:
+ """Parse the response text by the regex pattern, and return a dict of
+ the content in the parsed field of the response.
+
+ Args:
+ response (`ModelResponse`):
+ The response to be parsed.
+
+ Returns:
+ `ModelResponse`: The response with the parsed field as the parsed
+ result.
+ """
+ assert response.text is not None, "The response text is None."
+
+ matches = re.finditer(
+ self.tagged_content_pattern,
+ response.text,
+ flags=re.DOTALL,
+ )
+
+ results = {}
+ for match in matches:
+ results[match.group("name")] = match.group("content")
+
+ keys_missing = [
+ key for key in self.required_keys if key not in results
+ ]
+
+ if len(keys_missing) > 0:
+ raise TagNotFoundError(
+ f"Failed to find tags: {', '.join(keys_missing)}",
+ response.text,
+ )
+
+ if self.try_parse_json:
+ keys_failed = []
+ for key in results:
+ try:
+ results[key] = json.loads(results[key])
+ except json.JSONDecodeError:
+ keys_failed.append(key)
+
+ logger.debug(
+ f'Failed to parse JSON for keys: {", ".join(keys_failed)}',
+ )
+
+ response.parsed = results
+ return response
+
+
+