From 067606b2147da6c8763aed351f4d075170577da5 Mon Sep 17 00:00:00 2001 From: Rongrong Date: Thu, 7 Nov 2024 01:36:49 +0800 Subject: [PATCH] =?UTF-8?q?fix(parsing.utils):=20hashtag-breaking=20'?= =?UTF-8?q?=E3=83=BB'=20not=20escaped?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rongrong --- docs/CHANGELOG.md | 1 + docs/CHANGELOG.zh.md | 1 + src/parsing/utils.py | 10 ++++++---- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 9b4863e686..91e70bb49f 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -11,6 +11,7 @@ - **Canonical `DATABASE_URL` not recognized**: Since v2.9.0, `DATABASE_URL` is canonicalized before connecting to the corresponding database. However, a canonical URL pointing to a local path cannot be recognized when checking the validity of the scheme (database type). Both canonical (`scheme:/path/to/file.db`) and traditional (`scheme:///path/to/file.db`) forms of such URLs are recognized correctly now. - **Monitoring not deferred as per server-side cache when subscribing**: Since v2.7.0, monitoring tasks will be deferred when aggressive server-side caches (e.g., Cloudflare and RSSHub, which make it futile to check for updates before cache expiration) are detected. However, the first monitoring task for a newly subscribed feed was not being deferred. This has been fixed and the first monitoring task now waits for the server-side cache to expire. +- **Minor bug fixes** ## v2.9.0: Telegraph-related revert, skip cert verification, and more diff --git a/docs/CHANGELOG.zh.md b/docs/CHANGELOG.zh.md index 3e1f400391..29bdbffc4b 100644 --- a/docs/CHANGELOG.zh.md +++ b/docs/CHANGELOG.zh.md @@ -11,6 +11,7 @@ - **无法识别规范的 `DATABASE_URL`**: 自 v2.9.0 起, 在连接到相应的数据库之前,`DATABASE_URL` 被规范化。然而,在检查 scheme (数据库类型) 的合法性时,无法识别指向本地路径的规范 URL。现在,此类 URL 的规范 (`scheme:/path/to/file.db`) 和传统 (`scheme:///path/to/file.db`) 形式都被正确识别。 - **订阅时不会根据服务端缓存延迟监控**:自 v2.7.0 起,当检测到激进的服务器端缓存时,监控任务将被延迟(例如 Cloudflare 和 RSSHub,它们使得在缓存过期之前检查更新变得徒劳无功)。但是,当新订阅 feed 时,第一个监视任务不会被推迟。该问题已修复,第一个监控任务会等待服务端缓存过期。 +- **次要的 bug 修复** ## v2.9.0: 与 Telegraph 相关的 revert、跳过证书校验和更多 diff --git a/src/parsing/utils.py b/src/parsing/utils.py index 85fe231b95..10ac5a26bb 100644 --- a/src/parsing/utils.py +++ b/src/parsing/utils.py @@ -94,9 +94,11 @@ '\u2028' # LINE SEPARATOR '\u2029' # PARAGRAPH SEPARATOR ) -CHARACTERS_TO_ESCAPE_IN_HASHTAG: Final[str] = ''.join( - # all characters here will be replaced with '_' - sorted(set(SPACES + INVALID_CHARACTERS + string.punctuation + string.whitespace)) +INVALID_CHARACTERS_IN_HASHTAG: Final[str] = ''.join( + sorted( + # Known characters that break hashtags. Though '・' breaks hashtags, it is not the case of '·'. + set(chain(SPACES, INVALID_CHARACTERS, string.punctuation, string.whitespace, '・')) + ) ) escapeSpecialCharInReSet = partial( @@ -155,7 +157,7 @@ def __merge_chars_into_ranged_set(sorted_chars: str) -> str: ' ', ) escapeHashtag = partial( - re.compile(rf'[{__merge_chars_into_ranged_set(CHARACTERS_TO_ESCAPE_IN_HASHTAG)}]+').sub, + re.compile(rf'[{__merge_chars_into_ranged_set(INVALID_CHARACTERS_IN_HASHTAG)}]+').sub, '_', ) isAbsoluteHttpLink = re.compile(r'^https?://').match