From 55d6a9c0b2c3ffd286aa87e4530dc639cfbd7bf7 Mon Sep 17 00:00:00 2001 From: barry Date: Sun, 18 Feb 2024 16:58:21 -0500 Subject: [PATCH] Cleanup and final redgif support --- .../celery/task_logic/ingest_task_logic.py | 13 ++++- .../core/celery/tasks/ingest_tasks.py | 2 +- .../core/services/redgifs_token_manager.py | 29 +++++++++-- redditrepostsleuth/core/util/helpers.py | 49 +++++++++++++------ redditrepostsleuth/ingestsvc/ingestsvc.py | 2 +- 5 files changed, 73 insertions(+), 22 deletions(-) diff --git a/redditrepostsleuth/core/celery/task_logic/ingest_task_logic.py b/redditrepostsleuth/core/celery/task_logic/ingest_task_logic.py index d8e8b2a..00d71bc 100644 --- a/redditrepostsleuth/core/celery/task_logic/ingest_task_logic.py +++ b/redditrepostsleuth/core/celery/task_logic/ingest_task_logic.py @@ -51,7 +51,14 @@ def pre_process_post( post = reddit_submission_to_post(submission) + proxy = None + parsed_url = urlparse(post.url) + if parsed_url.netloc in domains_to_proxy: + proxy = proxy_manager.get_proxy().address + if post.post_type_id == 2: # image + + # Hacky RedGif support. 
Will need to be refactored if we have to do similar for other sites redgif_url = None if 'redgif' in post.url: token = redgif_manager.get_redgifs_token() @@ -62,7 +69,7 @@ def pre_process_post( redgif_manager.remove_redgifs_token('localhost') raise e - process_image_post(post, url=redgif_url) + process_image_post(post, url=redgif_url, proxy=proxy) elif post.post_type_id == 6: # gallery process_gallery(post, submission) @@ -82,7 +89,9 @@ def process_image_post(post: Post, url: str = None, proxy: str = None, hash_size :param hash_size: Size of hash :return: Post object with hashes """ - log.info('Hashing image with URL: %s', post.url) + log.debug('Hashing image with URL: %s', post.url) + if url: + log.info('Hashing %s', post.url) try: img = generate_img_by_url_requests(url or post.url, proxy=proxy) diff --git a/redditrepostsleuth/core/celery/tasks/ingest_tasks.py b/redditrepostsleuth/core/celery/tasks/ingest_tasks.py index ca79daf..4bb4726 100644 --- a/redditrepostsleuth/core/celery/tasks/ingest_tasks.py +++ b/redditrepostsleuth/core/celery/tasks/ingest_tasks.py @@ -69,7 +69,7 @@ def save_new_post(self, submission: dict, repost_check: bool = True): monitored_sub = uow.monitored_sub.get_by_sub(post.subreddit) if monitored_sub and monitored_sub.active: - log.info('Sending ingested post to monitored sub queue') + log.info('Sending ingested post to monitored sub queue for %s', monitored_sub.name) celery.send_task('redditrepostsleuth.core.celery.tasks.monitored_sub_tasks.sub_monitor_check_post', args=[post.post_id, monitored_sub], queue='submonitor', countdown=20) diff --git a/redditrepostsleuth/core/services/redgifs_token_manager.py b/redditrepostsleuth/core/services/redgifs_token_manager.py index e3500c2..ff9a20d 100644 --- a/redditrepostsleuth/core/services/redgifs_token_manager.py +++ b/redditrepostsleuth/core/services/redgifs_token_manager.py @@ -10,6 +10,10 @@ log = logging.getLogger(__name__) +""" +Class for managing and caching RedGifs API tokens. 
Currently overkill but if we need to backfill the database or +API rate limits get tight this will support caching a token for each proxy to Redis +""" class RedGifsTokenManager: def __init__(self): config = Config() @@ -22,16 +26,30 @@ def __init__(self): ) - def _cache_token(self, key: str, token: str): + def _cache_token(self, key: str, token: str) -> None: + """ + Take a given token and cache it to Redis + :param key: key of the token + :param token: API token + """ log.info('Caching token for %s', key) self.redis.set(f'redgifs-token:{key}', token, ex=82800) - def remove_redgifs_token(self, key: str): + def remove_redgifs_token(self, key: str) -> None: + """ + Remove a cached token from Redis with a given key + :param key: key to remove + """ log.info('Removing token for %s', key) self.redis.delete(f'redgifs-token:{key}') def get_redgifs_token(self, address: str = 'localhost') -> str: + """ + Either return an existing cached token or create a new one + :param address: address of the proxy being used + :return: Token + """ cached_token = self.redis.get(f'redgifs-token:{address}') if not cached_token: return self._request_and_cache_token(address) @@ -40,7 +58,12 @@ def get_redgifs_token(self, address: str = 'localhost') -> str: return cached_token - def _request_and_cache_token(self, proxy_address): + def _request_and_cache_token(self, proxy_address: str = 'localhost') -> str: + """ + Hit the Redgif API and request a new auth token. 
Cache it to Redis + :param proxy_address: Proxy to use, if any + :return: Token + """ proxies = None if proxy_address != 'localhost': proxies = {'http': f'http://{proxy_address}', 'https': f'https://{proxy_address}'} diff --git a/redditrepostsleuth/core/util/helpers.py b/redditrepostsleuth/core/util/helpers.py index 06347d3..d3c9d03 100644 --- a/redditrepostsleuth/core/util/helpers.py +++ b/redditrepostsleuth/core/util/helpers.py @@ -250,33 +250,52 @@ def get_default_image_search_settings(config: Config) -> ImageSearchSettings: ) def get_image_search_settings_from_request(req, config: Config) -> ImageSearchSettings: - return ImageSearchSettings( + search_settings = ImageSearchSettings( req.get_param_as_int('target_match_percent', required=True, default=None) or config.default_image_target_match, config.default_image_target_annoy_distance, target_title_match=req.get_param_as_int('target_title_match', required=False, default=None) or config.default_image_target_title_match, - filter_dead_matches=req.get_param_as_bool('filter_dead_matches', required=False, - default=None) or config.default_image_dead_matches_filter, - filter_removed_matches=req.get_param_as_bool('filter_removed_matches', required=False, - default=None) or config.default_image_removed_match_filter, - only_older_matches=req.get_param_as_bool('only_older_matches', required=False, - default=None) or config.default_image_only_older_matches, - filter_same_author=req.get_param_as_bool('filter_same_author', required=False, - default=None) or config.default_image_same_author_filter, - filter_crossposts=req.get_param_as_bool('filter_crossposts', required=False, - default=None) or config.default_image_crosspost_filter, + filter_dead_matches=req.get_param_as_bool('filter_dead_matches', required=False, default=None), + filter_removed_matches=req.get_param_as_bool('filter_removed_matches', required=False, default=None), + only_older_matches=req.get_param_as_bool('only_older_matches', required=False, default=None), 
+ filter_same_author=req.get_param_as_bool('filter_same_author', required=False, default=None), + filter_crossposts=req.get_param_as_bool('include_crossposts', required=False, default=None), target_meme_match_percent=req.get_param_as_int('target_meme_match_percent', required=False, default=None) or config.default_image_target_meme_match, - meme_filter=req.get_param_as_bool('meme_filter', required=False, - default=None) or config.default_image_meme_filter, - same_sub=req.get_param_as_bool('same_sub', required=False, - default=None) or config.default_image_same_sub_filter, + meme_filter=req.get_param_as_bool('meme_filter', required=False, default=None), + same_sub=req.get_param_as_bool('same_sub', required=False, default=None), max_days_old=req.get_param_as_int('max_days_old', required=False, default=None) or config.default_link_max_days_old_filter, max_depth=10000 ) + if search_settings.filter_dead_matches is None: + search_settings.filter_dead_matches = config.default_image_dead_matches_filter + + if search_settings.filter_removed_matches is None: + search_settings.filter_removed_matches = config.default_image_removed_match_filter + + if search_settings.only_older_matches is None: + search_settings.only_older_matches = config.default_image_only_older_matches + + if search_settings.filter_same_author is None: + search_settings.filter_same_author = config.default_image_same_author_filter + + if search_settings.meme_filter is None: + search_settings.meme_filter = config.default_image_meme_filter + + if search_settings.filter_crossposts is None: + search_settings.filter_crossposts = config.default_image_crosspost_filter + else: + search_settings.filter_crossposts = not search_settings.filter_crossposts + + if search_settings.same_sub is None: + search_settings.same_sub = config.default_image_same_sub_filter + + + return search_settings + def get_default_link_search_settings(config: Config) -> SearchSettings: return SearchSettings( diff --git 
a/redditrepostsleuth/ingestsvc/ingestsvc.py b/redditrepostsleuth/ingestsvc/ingestsvc.py index 8f17965..28aa4b1 100644 --- a/redditrepostsleuth/ingestsvc/ingestsvc.py +++ b/redditrepostsleuth/ingestsvc/ingestsvc.py @@ -190,7 +190,7 @@ async def main() -> None: oldest_post = uow.posts.get_newest_post() oldest_id = oldest_post.post_id - #await ingest_range(newest_id, oldest_id) + await ingest_range(newest_id, oldest_id) delay = 0 while True: