Skip to content

Commit

Permalink
feat: remove XML illegal chars in metadata before uploading
Browse files Browse the repository at this point in the history
  • Loading branch information
yzqzss committed Jul 25, 2023
1 parent 4c0516d commit 72c552a
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 13 deletions.
36 changes: 23 additions & 13 deletions biliarchiver/_biliarchiver_upload_bvid.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from biliarchiver.utils.identifier import human_readable_upper_part_map
from biliarchiver.config import BILIBILI_IDENTIFIER_PERFIX, config
from biliarchiver.utils.dirLock import UploadLock, AlreadyRunningError
from biliarchiver.utils.xml_chars import xml_chars_legalize
from biliarchiver.version import BILI_ARCHIVER_VERSION

def upload_bvid(bvid: str, *, update_existing: bool = False, collection: str):
Expand Down Expand Up @@ -154,17 +155,17 @@ def _upload_bvid(bvid: str, *, update_existing: bool = False, collection: str):
'scanner': f'biliarchiver v{BILI_ARCHIVER_VERSION} (dev)',
}

# XML 中不能有 \b 等特殊控制字符,IA 会拒收。
# 先只简单删 \b ,如果以后再发现元数据里出现其它非法字符,再说。
_md_str = json.dumps(md, ensure_ascii=False)
if "\\b" in _md_str:
print("WARNING: \\b in metadata, removing it")
md = json.loads(_md_str.replace("\\b", ""))
del(_md_str)

print(filedict)
print(md)

# remove XML illegal characters
_md_before = hash(json.dumps(md))
md = xml_chars_legalize(obj=md)
assert isinstance(md, dict)
if hash(json.dumps(md)) != _md_before:
print(f"Removed XML illegal characters from metadata, cleaned metadata:")
print(md)

if filedict:
r = item.upload(
files=filedict,
Expand All @@ -186,18 +187,27 @@ def _upload_bvid(bvid: str, *, update_existing: bool = False, collection: str):

new_md = {}
if item.metadata.get("upload-state") != "uploaded":
new_md.update({"upload-state": "uploaded"})
new_md["upload-state"] = "uploaded"
if item.metadata.get("creator") != md['creator']:
new_md.update({"creator": md['creator']})
new_md["creator"] = md['creator']
if item.metadata.get("description", "") != bv_info['data']['View']['desc']:
new_md.update({"description": bv_info['data']['View']['desc']})
new_md["description"] = bv_info['data']['View']['desc']
if item.metadata.get("scanner") != md['scanner']:
new_md.update({"scanner": md['scanner']})
new_md["scanner"] = md['scanner']
if item.metadata.get("external-identifier") != md['external-identifier']:
new_md.update({"external-identifier": md['external-identifier']})
new_md["external-identifier"] = md['external-identifier']
if new_md:
print(f"Updating metadata:")
print(new_md)

# Remove XML illegal characters from the metadata patch.
# The snapshot must be taken from new_md (the object that is cleaned and
# compared below); snapshotting md instead made the comparison unequal
# almost every time, so the message was printed even when nothing changed.
_md_before = json.dumps(new_md)
new_md = xml_chars_legalize(obj=new_md)
assert isinstance(new_md, dict)
if json.dumps(new_md) != _md_before:
    print(f"Removed XML illegal characters from metadata, cleaned metadata:")
    print(new_md)

r = item.modify_metadata(
metadata=new_md,
access_key=access_key,
Expand Down
73 changes: 73 additions & 0 deletions biliarchiver/utils/xml_chars.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
from typing import Union

# Characters that are stripped from metadata before it is serialized to XML.
#
# The list holds NUL plus every other control character below U+0020 except
# TAB (\x09), LF (\x0a) and CR (\x0d), which XML allows, and finally DEL (\x7f).
# NUL was missing from the original ranges even though it is not a valid XML
# character, so it is listed explicitly here.
#
# NOTE: the C1 control characters U+0080-U+0084 and U+0086-U+009F *can* occur
# in a Python str, but they are deliberately not stripped here; XML 1.0 only
# discourages them rather than forbidding them.
_xml_ILLEGAL_CHARS = ['\x00']
_xml_ILLEGAL_CHARS.extend(chr(i) for i in range(0x01, 0x08 + 1))
_xml_ILLEGAL_CHARS.extend(chr(i) for i in range(0x0b, 0x0c + 1))
_xml_ILLEGAL_CHARS.extend(chr(i) for i in range(0x0e, 0x1f + 1))
_xml_ILLEGAL_CHARS.append('\x7f')
XML_ILLEGAL_CHARS = _xml_ILLEGAL_CHARS


def _legalize_str(s: str, print_info: bool=False) -> str:
    """Return *s* with every character listed in XML_ILLEGAL_CHARS removed.

    Args:
        s: The string to clean.
        print_info: When true, print one line for each distinct illegal
            character that was actually present in *s*.

    Returns:
        The cleaned string (*s* itself when it contained nothing illegal).
    """
    for c in XML_ILLEGAL_CHARS:
        # Test for the character directly rather than comparing hashes of the
        # string before and after the replacement.
        if c not in s:
            continue
        s = s.replace(c, '')
        if print_info:
            print(f"Removed XML illegal char \\x{ord(c):02x}")
    return s

def _legalize_list(l: list):
    """Clean the elements of *l* in place and return the same list object.

    str, dict and list elements are replaced by the result of the matching
    ``_legalize_*`` helper; elements of any other type are left untouched.
    """
    cleaners = (
        (str, _legalize_str),
        (dict, _legalize_dict),
        (list, _legalize_list),
    )
    for pos, item in enumerate(l):
        for kind, clean in cleaners:
            if isinstance(item, kind):
                l[pos] = clean(item)
                break
    return l

def _legalize_dict(d: dict):
    """Clean the values of *d* in place and return the same dict object.

    Only values are processed (keys are never modified): str, list and dict
    values are replaced by the result of the matching ``_legalize_*`` helper,
    values of any other type are left untouched.
    """
    for key, value in d.items():
        if isinstance(value, str):
            cleaned = _legalize_str(value)
        elif isinstance(value, list):
            cleaned = _legalize_list(value)
        elif isinstance(value, dict):
            cleaned = _legalize_dict(value)
        else:
            continue
        # Rebinding an existing key does not resize the dict, so this is safe
        # while iterating over d.items().
        d[key] = cleaned
    return d

def xml_chars_legalize(obj: Union[dict, str, list]) -> Union[dict, str, list]:
    """Strip XML illegal characters from a str, dict or list.

    The work is delegated to the helper matching the type of *obj*; the dict
    and list helpers modify their argument in place and return it.

    Raises:
        TypeError: If *obj* is not a str, dict or list.
    """
    if isinstance(obj, str):
        return _legalize_str(obj)
    if isinstance(obj, dict):
        return _legalize_dict(obj)
    if isinstance(obj, list):
        return _legalize_list(obj)
    raise TypeError(f"Unexpected type: {type(obj)}")

def _test_xml_chars_legalize():
    """Check the helpers on a plain string, a list, a dict and a nested mix."""
    # '\v' is the same character as '\x0b'.
    assert _legalize_str('\x0bA\x0cB\vC') == 'ABC'

    sample_list = ['\x0bA', '\x0cB', 'C\v']
    expected_list = ['A', 'B', 'C']
    assert _legalize_list(sample_list) == expected_list

    # sample_list is reused as a nested value below; _legalize_list works in
    # place, so it has already been cleaned by the call above.
    sample_dict = {'A': '\x0bA', 'B': '\x0cB', 'C': 'C\v', 'list': sample_list}
    expected_dict = {'A': 'A', 'B': 'B', 'C': 'C', 'list': expected_list}
    assert _legalize_dict(sample_dict) == expected_dict

    # dict -> list -> dict -> (list, dict) nesting.
    nested = {
        "dict": [
            {'A': '\x0bA', 'B': '\x0cB', 'C': 'C\v', 'list': sample_list, 'dict': sample_dict},
        ],
    }
    expected_nested = {
        'dict': [
            {'A': 'A', 'B': 'B', 'C': 'C', 'list': expected_list, 'dict': expected_dict},
        ],
    }
    assert _legalize_dict(nested) == expected_nested

if __name__ == '__main__':
    # TODO: use pytest
    # Until then, running this module directly executes the self-test above.
    _test_xml_chars_legalize()

0 comments on commit 72c552a

Please sign in to comment.