Skip to content

Commit

Permalink
retry video info extract on missing videoData
Browse files Browse the repository at this point in the history
  • Loading branch information
elfkuzco committed Sep 17, 2024
1 parent 3d71c31 commit abecbcc
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 7 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed

- Restore functionality to resist temporary bad TED responses when parsing video pages (#209)
- Retry video data extraction if `videoData` is missing from page data (#226)

## [3.0.2] - 2024-06-24

Expand Down
26 changes: 19 additions & 7 deletions src/ted2zim/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,7 +803,7 @@ def extract_info_from_video_page(
"""

# Every TED video page has a <script>-tag with a Javascript
# object with JSON in it. We will just stip away the object
# object with JSON in it. We will just strip away the object
# signature and load the json to extract meta-data out of it.
# returns True if successfully scraped new video

Expand Down Expand Up @@ -840,9 +840,21 @@ def extract_info_from_video_page(
url, retry_count=retry_count + 1
)

json_data = json.loads(next_data_tag.string)["props"]["pageProps"][
"videoData"
]
# Sometimes, the video data is not included in the json data, so we retry
# the request.
try:
json_data = json.loads(next_data_tag.string)["props"]["pageProps"][
"videoData"
]
except KeyError:
logger.debug(
"Insufficient data returned by server, videoData not "
"found in JSON string. Retrying in 5 seconds..."
)
time.sleep(5)
return self.extract_info_from_video_page(
url, retry_count=retry_count + 1
)

requested_lang_code = self.get_lang_code_from_url(url)
if requested_lang_code and json_data["language"] != requested_lang_code:
Expand All @@ -855,10 +867,10 @@ def extract_info_from_video_page(
# and overwrite it accordingly
json_data["playerData"] = json.loads(json_data["playerData"])
return json_data
except Exception:
except Exception as exc:
logger.error(
f"Problem occured while parsing {url}. HTML content was:\n"
f"{html_content}"
f"Problem occured while parsing {url}, error: {exc!s}. "
f"HTML content was:\n{html_content}"
)
raise

Expand Down

0 comments on commit abecbcc

Please sign in to comment.