Skip to content

Commit

Permalink
working redgifs ingest. Proxy not enabled
Browse files Browse the repository at this point in the history
  • Loading branch information
barrycarey committed Feb 18, 2024
1 parent 61dff2f commit cd56958
Show file tree
Hide file tree
Showing 7 changed files with 178 additions and 16 deletions.
64 changes: 58 additions & 6 deletions redditrepostsleuth/core/celery/task_logic/ingest_task_logic.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,68 @@
import logging
import os
from hashlib import md5
from typing import Optional
from urllib.parse import urlparse

import imagehash
import redgifs
from redgifs import HTTPException

from redditrepostsleuth.core.db.databasemodels import Post, PostHash
from redditrepostsleuth.core.exception import ImageRemovedException, ImageConversionException, InvalidImageUrlException, \
GalleryNotProcessed
from redditrepostsleuth.core.util.imagehashing import log, generate_img_by_url_requests
from redditrepostsleuth.core.proxy_manager import ProxyManager
from redditrepostsleuth.core.services.redgifs_token_manager import RedGifsTokenManager
from redditrepostsleuth.core.util.constants import GENERIC_USER_AGENT
from redditrepostsleuth.core.util.imagehashing import generate_img_by_url_requests
from redditrepostsleuth.core.util.objectmapping import reddit_submission_to_post

log = logging.getLogger(__name__)

def pre_process_post(submission: dict) -> Optional[Post]:

def get_redgif_id_from_url(url: str) -> Optional[str]:
    """
    Extract the RedGifs media ID from a URL
    :param url: URL pointing at RedGifs media, e.g. https://i.redgifs.com/i/<id>.jpg
    :return: Final path segment with its file extension removed, or None if the URL has no usable path
    """
    url_path = urlparse(url).path
    # Take the last path segment instead of only stripping '/i/' so that other
    # path prefixes (e.g. /watch/<id>) also resolve to a bare ID
    last_segment = url_path.rstrip('/').rsplit('/', 1)[-1]
    gif_id, _ = os.path.splitext(last_segment)
    # Normalise an empty result to None to honour the Optional[str] contract
    return gif_id or None

def get_redgif_image_url(reddit_url: str, token: str, proxy: str = None) -> Optional[str]:
    """
    Look up the HD media URL for a RedGifs link through the RedGifs API
    :param reddit_url: URL from the Reddit submission that points at RedGifs
    :param token: RedGifs token sent as a Bearer authorization header
    :param proxy: Optional proxy address applied to both http and https requests
    :return: HD URL of the gif, or None if no ID could be parsed from the URL
    :raises Exception: Any error raised by the RedGifs API client is logged and re-raised
    """
    gif_id = get_redgif_id_from_url(reddit_url)
    if not gif_id:
        log.error('Failed to parse RedGifs ID from %s', reddit_url)
        return None

    api = redgifs.API()
    api.http._proxy = {'http': proxy, 'https': proxy}
    api.http.headers.update({'User-Agent': GENERIC_USER_AGENT, 'authorization': f'Bearer {token}'})
    try:
        gif = api.get_gif(gif_id)
    except Exception as e:
        # Re-raise instead of falling through: continuing would hit an unbound
        # `gif` below and hide the real error (e.g. an HTTPException carrying a
        # token error code) from the caller
        log.error('Failed to get RedGifs data for ID %s: %s', gif_id, e)
        raise
    return gif.urls.hd


def pre_process_post(
submission: dict,
proxy_manager: ProxyManager,
redgif_manager: RedGifsTokenManager,
domains_to_proxy: list[str]
) -> Optional[Post]:

post = reddit_submission_to_post(submission)

if post.post_type_id == 2: # image
process_image_post(post)
redgif_url = None
if 'redgif' in post.url:
token = redgif_manager.get_redgifs_token()
try:
redgif_url = get_redgif_image_url(submission['url'], token)
except HTTPException as e:
if 'code' in e.error and e.error['code'] == 'TokenDecodeError':
redgif_manager.remove_redgifs_token('localhost')
raise e

process_image_post(post, url=redgif_url)
elif post.post_type_id == 6: # gallery
process_gallery(post, submission)

Expand All @@ -28,12 +73,19 @@ def pre_process_post(submission: dict) -> Optional[Post]:
return post


def process_image_post(post: Post, hash_size: int = 16) -> Post:

def process_image_post(post: Post, url: str = None, proxy: str = None, hash_size: int = 16) -> Post:
"""
Process an image post to generate the required hashes
:param proxy: Proxy to request image with
:param post: post object
:param url: Alternate URL to use
:param hash_size: Size of hash
:return: Post object with hashes
"""
log.info('Hashing image with URL: %s', post.url)

try:
img = generate_img_by_url_requests(post.url)
img = generate_img_by_url_requests(url or post.url, proxy=proxy)
except ImageConversionException as e:
log.warning('Image conversion error: %s', e)
raise
Expand Down
46 changes: 41 additions & 5 deletions redditrepostsleuth/core/celery/tasks/ingest_tasks.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,47 @@
import json
import random
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Optional

import requests
from celery import Task
from redgifs import HTTPException
from sqlalchemy.exc import IntegrityError

from redditrepostsleuth.core.celery import celery
from redditrepostsleuth.core.celery.basetasks import SqlAlchemyTask
from redditrepostsleuth.core.celery.task_logic.ingest_task_logic import pre_process_post
from redditrepostsleuth.core.celery.task_logic.ingest_task_logic import pre_process_post, get_redgif_image_url
from redditrepostsleuth.core.config import Config
from redditrepostsleuth.core.db.db_utils import get_db_engine
from redditrepostsleuth.core.db.uow.unitofworkmanager import UnitOfWorkManager
from redditrepostsleuth.core.exception import InvalidImageUrlException, GalleryNotProcessed, ImageConversionException, \
ImageRemovedException
ImageRemovedException, RedGifsTokenException
from redditrepostsleuth.core.logging import get_configured_logger
from redditrepostsleuth.core.proxy_manager import ProxyManager
from redditrepostsleuth.core.services.eventlogging import EventLogging
from redditrepostsleuth.core.services.redgifs_token_manager import RedGifsTokenManager
from redditrepostsleuth.core.util.constants import GENERIC_USER_AGENT
from redditrepostsleuth.core.util.objectmapping import reddit_submission_to_post

log = get_configured_logger('redditrepostsleuth')


@celery.task(bind=True, base=SqlAlchemyTask, ignore_reseults=True, serializer='pickle', autoretry_for=(ConnectionError,ImageConversionException,GalleryNotProcessed), retry_kwargs={'max_retries': 10, 'countdown': 300})
@dataclass
class RedGifsToken:
    """
    Simple record describing a RedGifs token
    """
    # Token string
    token: str
    # Point in time at which the token expires
    expires_at: datetime
    # NOTE(review): presumably the proxy the token was obtained through - confirm against usage
    proxy: str

class IngestTask(Task):
    """
    Celery task base class that builds the services the ingest task reads from `self`
    """

    def __init__(self):
        config = Config()
        uowm = UnitOfWorkManager(get_db_engine(config))

        self.config = config
        self.uowm = uowm
        self.event_logger = EventLogging()
        self._redgifs_token_manager = RedGifsTokenManager()
        # NOTE(review): the meaning of 1000 is defined by ProxyManager, which is not visible here
        self._proxy_manager = ProxyManager(uowm, 1000)
        self.domains_to_proxy = []

@celery.task(bind=True, base=IngestTask, ignore_reseults=True, serializer='pickle', autoretry_for=(ConnectionError,ImageConversionException,GalleryNotProcessed, HTTPException), retry_kwargs={'max_retries': 10, 'countdown': 300})
def save_new_post(self, submission: dict, repost_check: bool = True):

# TODO: temp fix until I can fix imgur gifs
Expand All @@ -24,9 +54,15 @@ def save_new_post(self, submission: dict, repost_check: bool = True):
return

try:
post = pre_process_post(submission)
post = pre_process_post(submission, self._proxy_manager, self._redgifs_token_manager, [])
except (ImageRemovedException, InvalidImageUrlException) as e:
return
except GalleryNotProcessed as e:
log.warning('Gallery not finished processing')
raise e
except Exception as e:
log.exception('Failed during post pre-process')
return

if not post:
return
Expand Down
6 changes: 5 additions & 1 deletion redditrepostsleuth/core/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,8 @@ def __init__(self, message):

class UserNotFound(RepostSleuthException):
    """
    Exception type for a user that could not be found
    """

    def __init__(self, message):
        # Initialise the base exception once; the duplicated call was redundant
        super(UserNotFound, self).__init__(message)

class RedGifsTokenException(RepostSleuthException):
    """
    Exception type for failures while obtaining a RedGifs token
    """

    def __init__(self, message):
        super().__init__(message)
61 changes: 61 additions & 0 deletions redditrepostsleuth/core/services/redgifs_token_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import json
import logging

import requests
from redis import Redis

from redditrepostsleuth.core.config import Config
from redditrepostsleuth.core.exception import RedGifsTokenException
from redditrepostsleuth.core.util.constants import GENERIC_USER_AGENT

log = logging.getLogger(__name__)

class RedGifsTokenManager:
    """
    Requests temporary RedGifs API tokens and caches them in Redis.

    Tokens are cached per address ('localhost' or a proxy address) under the
    key 'redgifs-token:<address>'.
    """

    # Endpoint that issues temporary tokens
    TOKEN_URL = 'https://api.redgifs.com/v2/auth/temporary'
    # Lifetime of a cached token in Redis, in seconds (23 hours)
    TOKEN_TTL_SECONDS = 82800
    # Seconds to wait for the token endpoint so a stalled request cannot hang the worker
    REQUEST_TIMEOUT_SECONDS = 10

    def __init__(self):
        config = Config()
        self.redis = Redis(
            host=config.redis_host,
            port=config.redis_port,
            db=config.redis_database,
            password=config.redis_password,
            decode_responses=True
        )

    @staticmethod
    def _redis_key(key: str) -> str:
        """
        Build the Redis key a token is stored under
        :param key: Address the token belongs to
        :return: Namespaced Redis key
        """
        return f'redgifs-token:{key}'

    def _cache_token(self, key: str, token: str):
        """
        Store a token in Redis with an expiry of TOKEN_TTL_SECONDS
        :param key: Address the token belongs to
        :param token: Token to cache
        """
        log.info('Caching token for %s', key)
        self.redis.set(self._redis_key(key), token, ex=self.TOKEN_TTL_SECONDS)

    def remove_redgifs_token(self, key: str):
        """
        Delete the cached token for an address
        :param key: Address the token belongs to
        """
        log.info('Removing token for %s', key)
        self.redis.delete(self._redis_key(key))

    def get_redgifs_token(self, address: str = 'localhost') -> str:
        """
        Return the token for an address, requesting and caching a new one if none is cached
        :param address: 'localhost' or the proxy address the token is tied to
        :return: Token string
        :raises RedGifsTokenException: If a new token is needed and cannot be obtained
        """
        cached_token = self.redis.get(self._redis_key(address))
        if cached_token:
            log.debug('Found cached token for %s', address)
            return cached_token

        return self._request_and_cache_token(address)

    def _request_and_cache_token(self, proxy_address):
        """
        Request a new temporary token from RedGifs and cache it
        :param proxy_address: Proxy to send the request through; 'localhost' or a falsy value means no proxy
        :return: Token string
        :raises RedGifsTokenException: On a non-200 response or a response without a usable token
        """
        # Treat a missing address like 'localhost' so we never build a proxy URL from None
        address = proxy_address or 'localhost'

        proxies = None
        if address != 'localhost':
            # Same proxy URL for both schemes; previously the schemes were swapped between the keys
            proxy_url = f'http://{address}'
            proxies = {'http': proxy_url, 'https': proxy_url}

        token_res = requests.get(
            self.TOKEN_URL,
            headers={'User-Agent': GENERIC_USER_AGENT},
            proxies=proxies,
            timeout=self.REQUEST_TIMEOUT_SECONDS
        )

        if token_res.status_code != 200:
            log.error('Failed to get RedGif token. Status Code %s', token_res.status_code)
            raise RedGifsTokenException(f'Failed to get RedGif token. Status Code {token_res.status_code}')

        try:
            token_data = json.loads(token_res.text)
        except ValueError as e:
            log.error('RedGif token response was not valid JSON')
            raise RedGifsTokenException('RedGif token response was not valid JSON') from e

        token = token_data.get('token') if isinstance(token_data, dict) else None
        if not token:
            log.error('RedGif token response did not contain a token')
            raise RedGifsTokenException('RedGif token response did not contain a token')

        self._cache_token(address, token)
        return token
2 changes: 2 additions & 0 deletions redditrepostsleuth/core/util/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36"
}

# Browser-like User-Agent string (Chrome 121 on Linux x86_64) for outgoing HTTP requests
GENERIC_USER_AGENT = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'

# NOTE(review): presumably the removal-reason values Reddit reports for removed posts - confirm against usage
REDDIT_REMOVAL_REASONS = ['deleted', 'author', 'reddit', 'copyright_takedown', 'content_takedown']

EXCLUDE_FROM_TOP_REPOSTERS = [
Expand Down
12 changes: 9 additions & 3 deletions redditrepostsleuth/core/util/imagehashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from redditrepostsleuth.core.db.databasemodels import Post
from redditrepostsleuth.core.exception import ImageConversionException, ImageRemovedException, InvalidImageUrlException
from redditrepostsleuth.core.util.constants import GENERIC_USER_AGENT

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -51,23 +52,28 @@ def generate_img_by_url(url: str) -> Image:

return img if img else None

def generate_img_by_url_requests(url: str) -> Optional[Image]:
def generate_img_by_url_requests(url: str, proxy: str = None) -> Optional[Image]:
"""
Take a URL and generate a PIL image
:param proxy: Optional proxy to use with request
:param url: URL to get
:return: PIL image
"""
if 'redd.it' in url:
useragent = 'repostsleuthbot:v1.0.3 Image Hasher (by /u/barrycarey)'
else:
useragent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
useragent = GENERIC_USER_AGENT

headers = {
'User-Agent': useragent
}

proxies = None
if proxy:
proxies = {'http': proxy, 'https': proxy}

try:
res = requests.get(url, headers=headers, timeout=7)
res = requests.get(url, headers=headers, timeout=7, proxies=proxies)
except (ConnectionError, Timeout) as e:
raise ImageConversionException(str(e))

Expand Down
3 changes: 2 additions & 1 deletion worker-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ distance==0.1.3
pydantic==1.10.9
sentry-sdk==1.29.2
pyjwt==2.8.0
cryptography==41.0.6
cryptography==41.0.6
redgifs==1.9.0

0 comments on commit cd56958

Please sign in to comment.