Skip to content

Commit

Permalink
perf(parsing.utils): ensure_plain: use run_async only if processing HTML
Browse files Browse the repository at this point in the history
Utilizing aio_helper.run_async() actually means that the function call
will be queued until the ThreadPoolExecutor is free to pick it up. The
queue can become extremely long under high load.

Prevent ensure_plain from utilizing aio_helper.run_async() when it is not
needed, so that the pressure on the ThreadPoolExecutor doesn't get too high.

Signed-off-by: Rongrong <[email protected]>
  • Loading branch information
Rongronggg9 committed Aug 5, 2024
1 parent 8eaf7e3 commit f102f34
Showing 1 changed file with 7 additions and 7 deletions.
14 changes: 7 additions & 7 deletions src/parsing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>.

from __future__ import annotations
from typing import Optional, Sequence, Union, Final, Iterable, Awaitable
from typing import Optional, Sequence, Union, Final, Iterable

import re
import string
Expand Down Expand Up @@ -166,13 +166,17 @@ async def html_validator(html: str) -> str:
return await run_async(_html_validator, html, prefer_pool='thread')


def ensure_plain_sync(s: str, enable_emojify: bool = False) -> str:
def _bs_html_get_text(s: str) -> str:
return BeautifulSoup(s, 'lxml').get_text()


async def ensure_plain(s: str, enable_emojify: bool = False) -> str:
if not s:
return s
s = stripAnySpace(
replaceSpecialSpace(
replaceInvalidCharacter(
BeautifulSoup(s, 'lxml').get_text()
await run_async(_bs_html_get_text, s, prefer_pool='thread')
if '<' in s and '>' in s
else unescape(s)
)
Expand All @@ -181,10 +185,6 @@ def ensure_plain_sync(s: str, enable_emojify: bool = False) -> str:
return emojify(s) if enable_emojify else s


def ensure_plain(s: str, enable_emojify: bool = False) -> Awaitable[str]:
return run_async(ensure_plain_sync, s, enable_emojify, prefer_pool='thread')


async def parse_entry(entry, feed_link: Optional[str] = None):
class EntryParsed:
content: str = ''
Expand Down

0 comments on commit f102f34

Please sign in to comment.