From 2f1c26fc477b8ee42126682061314917ec2283a3 Mon Sep 17 00:00:00 2001 From: Boris Lau Date: Wed, 31 Jul 2024 09:46:02 -0700 Subject: [PATCH] Handling unicode on annotation creation This is an odd bug! On specific browsers, users have noticed that saving an annotation in Label Studio fails with the following error: ``` psycopg2.errors.InvalidTextRepresentation: invalid input syntax for type json DETAIL: Unicode low surrogate must follow a high surrogate ``` We work around this by doing a round trip through JSON: serializing with `ensure_ascii=False` ensures that any Unicode characters are preserved as-is in the serialized string, so they can be stripped before the value is parsed again. Note: we don't care about any unicode in the annotation, as we don't use the text for referencing our annotation. Test Plan: We can reproduce this by forcing the annotation endpoint to consume unicode. 1. Use the `copy as CURL` option in your browser to copy the "update annotation" call from Label Studio 2. Inject some unicode into the CURL command - e.g. ``` curl 'http://localhost:8080/api/annotations/126?taskID=5979&project=43' \ -X 'PATCH' \ #.... snipped --data-raw $'{"result":[{"value":{"value":{"start":"/div[1]/div[1]/text()[1]","startOffset":0,"end":"/div[1]/div[1]/text()[1]","endOffset":88,"globalOffsets":{"start":0,"end":88},"text":"Some unicode: \udfff " ...],"draft_id":0,"parent_prediction":null,"parent_annotation":null,"project":"43"}' ``` 3. Run it against the dev server; previously this caused a 500, and with the fix it should return 200. 
--- label_studio/tasks/serializers.py | 40 +++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/label_studio/tasks/serializers.py b/label_studio/tasks/serializers.py index 648c3ab471e..9d35d1121c7 100644 --- a/label_studio/tasks/serializers.py +++ b/label_studio/tasks/serializers.py @@ -75,6 +75,42 @@ def create(self, *args, **kwargs): def validate_result(self, value): data = value + + # This is to work around a mysterious bug related to DRF handling: + # https://github.com/encode/django-rest-framework/issues/7026 + # + # This manifest itself as 500 error when writing to database: + # + # psycopg2.errors.InvalidTextRepresentation: invalid input syntax for + # type json + # DETAIL: Unicode low surrogate must follow a high surrogate + # + # This only happens to specific browser, presumably some bad unicode + # was leaked to the server. + # + # We work around this by doing a round trip to JSON, with + # ensure_ascii=False will ensure that any UNICODE characters are + # preserved, and therefore escaped on the round trip back. + # + # Note: we don't care about any unicode in annotation, as we don't use + # the text for referencing our annotation. + try: + json_str = value + if not isinstance(value, str): + json_str = json.dumps(value, ensure_ascii=False) + + # Strip unicode characters + non_unicode_json_str = json_str.encode('ascii', 'ignore').decode('ascii') + + # If the string is different, then we have some unicode characters + # then we will do the roundtrip de-unicoding. 
+ if json_str != non_unicode_json_str: + logging.warning('Annotation result contains unicode characters, stripping them') + value = non_unicode_json_str + except Exception as e: + # Be defensive here, just proceed as normal if we can't strip unicode + logging.error(f'Error while trying to strip unicode characters from annotation result: {e}') + # convert from str to json if need if isinstance(value, str): try: @@ -282,7 +318,7 @@ def _insert_valid_user_reviews(dicts, members_email_to_id, default_user): if email not in members_email_to_id: obj['created_by_id'] = default_user.id logger.warning('Email not found in members_email_to_id, default user used instead') - + # resolve annotators by email else: obj['created_by_id'] = members_email_to_id[email] @@ -427,7 +463,7 @@ def add_reviews(self, task_reviews, annotation_mapping, project): """ Save task reviews to DB """ return [] - + def add_drafts(self, task_drafts, db_tasks, annotation_mapping, project): """ Save task drafts to DB """