Skip to content

Commit

Permalink
Add retry logic with detailed logs to extraction of video data from …
Browse files Browse the repository at this point in the history
…HTML page
  • Loading branch information
benoit74 committed Jul 10, 2024
1 parent 9fc26d4 commit fb295fe
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 6 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Fixed

- Restore functionality to resist temporary bad TED responses when parsing video pages (#209)

## [3.0.2] - 2024-06-24

### Changed
Expand Down
29 changes: 23 additions & 6 deletions src/ted2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import dateutil.parser
import jinja2
import yt_dlp
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
from kiwixstorage import KiwixStorage
from pif import get_public_ip
from slugify import slugify
Expand Down Expand Up @@ -821,11 +821,28 @@ def extract_info_from_video_page(
try:
soup = BeautifulSoup(html_content, features="html.parser")

json_data = json.loads(
soup.find(
"script", attrs={"id": "__NEXT_DATA__"}
).string # pyright: ignore
)["props"]["pageProps"]["videoData"]
next_data_tag = soup.find("script", attrs={"id": "__NEXT_DATA__"})

# TED is sometimes inconsistent in sending HTML content; it sometimes sends
# the HTML without the required script containing the talks data, so we
# retry after 5 seconds
if (
not next_data_tag
or not isinstance(next_data_tag, Tag)
or not isinstance(next_data_tag.string, str)
):
logger.debug(
"Insufficient data returned by server, __NEXT_DATA__ script not "
"found in HTML page. Retrying in 5 seconds..."
)
time.sleep(5)
return self.extract_info_from_video_page(
url, retry_count=retry_count + 1
)

json_data = json.loads(next_data_tag.string)["props"]["pageProps"][
"videoData"
]

requested_lang_code = self.get_lang_code_from_url(url)
if requested_lang_code and json_data["language"] != requested_lang_code:
Expand Down

0 comments on commit fb295fe

Please sign in to comment.