HKUDS · LarFii · Dec 9, 2024 · Dec 9, 2024
diff --git a/lightrag/llm.py b/lightrag/llm.py
@@ -29,6 +29,7 @@
 from .utils import (
     wrap_embedding_func_with_attrs,
     locate_json_string_body_from_string,
+    safe_unicode_decode,
 )
 
 import sys
@@ -85,14 +86,14 @@ async def inner():
                 if content is None:
                     continue
                 if r"\u" in content:
-                    content = content.encode("utf-8").decode("unicode_escape")
+                    content = safe_unicode_decode(content.encode("utf-8"))
                 yield content
 
         return inner()
     else:
         content = response.choices[0].message.content
         if r"\u" in content:
-            content = content.encode("utf-8").decode("unicode_escape")
+            content = safe_unicode_decode(content.encode("utf-8"))
         return content
 
 

diff --git a/lightrag/utils.py b/lightrag/utils.py
@@ -507,3 +507,45 @@ async def save_to_cache(hashing_kv, cache_data: CacheData):
     }
 
     await hashing_kv.upsert({cache_data.mode: mode_cache})
+
+
+def safe_unicode_decode(content):
+    r"""Decode UTF-8 bytes and expand literal ``\uXXXX`` escape sequences.
+
+    Only the six-character form ``\uXXXX`` (backslash, ``u``, four hex
+    digits) is rewritten; every other character, including other backslash
+    sequences and already-decoded non-ASCII text, is left untouched.
+
+    A high surrogate escape immediately followed by a low surrogate escape
+    (for example ``\ud83d\ude00``) is merged into the single code point the
+    pair encodes, so the result stays encodable as UTF-8. An unpaired
+    surrogate escape is converted on its own.
+
+    Args:
+        content: UTF-8 encoded bytes.
+
+    Returns:
+        The decoded ``str`` with the escape sequences replaced.
+
+    Raises:
+        UnicodeDecodeError: If ``content`` is not valid UTF-8.
+    """
+    # The surrogate-pair alternative must come first so that a pair is
+    # consumed as a unit rather than as two independent escapes.
+    unicode_escape_pattern = re.compile(
+        r"\\u([dD][89abAB][0-9a-fA-F]{2})\\u([dD][c-fC-F][0-9a-fA-F]{2})"
+        r"|\\u([0-9a-fA-F]{4})"
+    )
+
+    def replace_unicode_escape(match):
+        single = match.group(3)
+        if single is not None:
+            return chr(int(single, 16))
+        high = int(match.group(1), 16)
+        low = int(match.group(2), 16)
+        # Standard UTF-16 surrogate pair arithmetic.
+        return chr(0x10000 + ((high - 0xD800) << 10) + (low - 0xDC00))
+
+    return unicode_escape_pattern.sub(
+        replace_unicode_escape, content.decode("utf-8")
+    )