From c201f45bf7dedd30ccc5d53c4a7febada986b04e Mon Sep 17 00:00:00 2001
From: EpicRandomGuy2
Date: Wed, 10 Apr 2024 23:55:46 -0400
Subject: [PATCH] Add creator links and first comment to database entries,
 switch to new databases

---
 config.py        |   9 +-
 legend_lore.py   |   6 +-
 mongodb_local.py |  36 ++++----
 notion.py        | 210 ++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 239 insertions(+), 22 deletions(-)

diff --git a/config.py b/config.py
index 5f5e3f3..ef7bcbb 100644
--- a/config.py
+++ b/config.py
@@ -7,20 +7,21 @@
 APP_NAME = "LegendLore"
 APP_VERSION = "1.0"
 
 if env == "PROD":
-    CONNECTION_STRING = "mongodb://192.168.1.47:27017/"
+    CONNECTION_STRING = "mongodb://192.168.1.47:27017/"  # Prod DB
     DB_NAME = "MapTaggerReddit"
 else:
-    CONNECTION_STRING = "mongodb://localhost:27017/"
+    CONNECTION_STRING = "mongodb://192.168.1.47:27018/"  # Dev DB
     DB_NAME = "MapTaggerReddit"
 
 DEFAULT_SUBREDDIT = "all"
 CREDENTIALS_FILE = "credentials.json"
 
 if env == "PROD":
-    NOTION_DB_ID = "95830a9189804ba29e9681e78b0236af"  # Prod Notion
+    NOTION_DB_ID = "9bc0c253895d4dbfa4a9ca62833af0d3"  # Prod Notion
 else:
-    NOTION_DB_ID = "e7d05d2c6280444698b59fa79df3f78f"  # Dev Notion
+    NOTION_DB_ID = "2bbecdb9a56943eb886f0ecf11d427d8"  # Dev Notion
 
 NOTION_DB_NAME = "LegendLore"
 NUMBER_OF_DAYS_OLD = 7
 UPDATE_SCORES_LIMIT = 250
+IGNORE_SENT_TO_NOTION = False
 
 SUBREDDITS = [
     "battlemaps",
     "dndmaps",
diff --git a/legend_lore.py b/legend_lore.py
index 84090d9..f6ee0d9 100644
--- a/legend_lore.py
+++ b/legend_lore.py
@@ -13,6 +13,7 @@
     SUBREDDITS,
     NUMBER_OF_DAYS_OLD,
     UPDATE_SCORES_LIMIT,
+    IGNORE_SENT_TO_NOTION,
 )
 from name_change import NAME_CHANGE
 
@@ -130,11 +131,14 @@ def main():
         try:
             notion.send_to_notion(
                 post,
-                overwrite=False,
+                overwrite=True,
+                ignore_sent_to_notion=IGNORE_SENT_TO_NOTION,
                 update_score=update_scores,
                 updated_score_titles=updated_score_titles,
             )
 
+            # notion.send_creator_link_to_notion(post)
+
             break
         except Exception as e:
             # If failure, wait 10 seconds and try again (up to 5 times)
diff --git a/mongodb_local.py b/mongodb_local.py
index fa4ab9a..b7c0fe4 100644
--- a/mongodb_local.py
+++ b/mongodb_local.py
@@ -1,6 +1,12 @@
 from pymongo import MongoClient
 from pandas import DataFrame
-from config import CONNECTION_STRING, DB_NAME, DEFAULT_SUBREDDIT, SUBREDDITS
+from config import (
+    CONNECTION_STRING,
+    DB_NAME,
+    DEFAULT_SUBREDDIT,
+    SUBREDDITS,
+    IGNORE_SENT_TO_NOTION,
+)
 
 
 def get_database_client(connection_string=CONNECTION_STRING, db_name=DB_NAME):
@@ -181,23 +187,23 @@ def set_sent_to_notion(
     db_name=DB_NAME,
     database=None,
 ):
+    if not IGNORE_SENT_TO_NOTION:
+        if database == None:
+            database_client = get_database_client(connection_string, db_name)
+            database = database_client[subreddit]
 
-    if database == None:
-        database_client = get_database_client(connection_string, db_name)
-        database = database_client[subreddit]
-
-    # Get post from DB
-    query = {"title": post["title"]}
-    post_df = DataFrame(database.find(query))
+        # Get post from DB
+        query = {"title": post["title"]}
+        post_df = DataFrame(database.find(query))
 
-    # Update any matching items in the query (should only be 1)
-    for index, row in post_df.iterrows():
-        query = {"_id": row["_id"]}
-        new = {"$set": {"sent_to_notion": sent}}
-        database.update_many(query, new)
-        database_client["all"].update_many(query, new)
+        # Update any matching items in the query (should only be 1)
+        for index, row in post_df.iterrows():
+            query = {"_id": row["_id"]}
+            new = {"$set": {"sent_to_notion": sent}}
+            database.update_many(query, new)
+            database_client["all"].update_many(query, new)
 
-    # print(f"sent_to_notion {sent} added to {post_df['title']}")
+        # print(f"sent_to_notion {sent} added to {post_df['title']}")
 
 
 def update_post_score(
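The IGNORE_SENT_TO_NOTION flag added above is a rebuild switch: when it is True, set_sent_to_notion() becomes a no-op, and (further down, in notion.py) handle_duplicates() returns True immediately, so every post in Mongo gets pushed to Notion again without per-post duplicate checks. A minimal sketch of that control flow, with stubbed stand-ins for the patched functions (illustrative only, not part of the patch):

    IGNORE_SENT_TO_NOTION = True  # config.py value; flip to True for a full rebuild

    def handle_duplicates(post, overwrite, ignore_sent_to_notion=IGNORE_SENT_TO_NOTION):
        # notion.py: rebuild mode skips the duplicate lookup entirely
        if ignore_sent_to_notion:
            return True
        # ... the real duplicate check against Notion would go here ...
        return not post.get("sent_to_notion", False)

    def set_sent_to_notion(post, sent=True):
        # mongodb_local.py: rebuild mode leaves the sent flag untouched
        if not IGNORE_SENT_TO_NOTION:
            post["sent_to_notion"] = sent

    post = {"title": "Forest Ambush [25x35]"}
    print(handle_duplicates(post, overwrite=True))  # True -> post is (re)sent
    set_sent_to_notion(post)                        # no-op while rebuilding
    print("sent_to_notion" in post)                 # False
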
database_client["all"].update_many(query, new) - # print(f"sent_to_notion {sent} added to {post_df['title']}") + # print(f"sent_to_notion {sent} added to {post_df['title']}") def update_post_score( diff --git a/notion.py b/notion.py index ac99dce..626b57e 100644 --- a/notion.py +++ b/notion.py @@ -1,4 +1,5 @@ import time +import re import requests import json import httpx @@ -10,7 +11,7 @@ from bs4 import BeautifulSoup from mongodb_local import get_database_client, get_post_from_db, set_sent_to_notion from pandas import DataFrame -from config import NOTION_DB_ID, CREDENTIALS_FILE +from config import NOTION_DB_ID, CREDENTIALS_FILE, IGNORE_SENT_TO_NOTION from do_not_post import DO_NOT_POST from name_change import NAME_CHANGE from pprint import pprint @@ -19,6 +20,7 @@ def send_to_notion( post, overwrite=False, + ignore_sent_to_notion=IGNORE_SENT_TO_NOTION, update_score=False, updated_score_titles=set(), subreddit=None, @@ -47,7 +49,15 @@ def send_to_notion( return # If dupe and no overwrite, skip this post - elif handle_duplicates(post, overwrite, subreddit=subreddit) == False: + elif ( + handle_duplicates( + post, + overwrite, + ignore_sent_to_notion=ignore_sent_to_notion, + subreddit=subreddit, + ) + == False + ): return # Else if post does not exist, or if we are overwriting, make post as usual @@ -96,6 +106,7 @@ def send_to_notion( {"text": {"content": post["author"].lstrip("u/")}} ], }, + "Link": {"url": get_creator_link(post)}, "Score": {"type": "number", "number": post["score"]}, "Subreddit": { "type": "multi_select", @@ -269,6 +280,53 @@ def send_to_notion( body["children"].extend(child) + # Append first comment child if it exists, else skip and move on + try: + first_comment = post["comments"][0] + + # API only accepts 2000 characters per block - need to break it up + number_of_blocks = (len(first_comment) // 2000) + 1 + word_break_index = 0 + new_block_first_word = "" + + for i in range(0, number_of_blocks): + + # Logic to stop blocks from splitting words down the middle + word_block = first_comment[2000 * i : 2000 * (i + 1)] + + # On the last loop we need the last word, there will be no rightmost_space + # so set it to take the highest possible index + if i != number_of_blocks - 1: + rightmost_space = word_block.rfind(" ") + + else: + # Stops the last part of the post from getting cut off due to no space + rightmost_space = 2000 + + word_break_index = 2000 * i + rightmost_space + + child = [ + { + "object": "block", + "type": "quote", + "quote": { + "rich_text": parse_markdown_links( + new_block_first_word + + first_comment[2000 * i : word_break_index] + ) + }, + } + ] + + # Don't include the space in the new block + new_block_first_word = word_block[rightmost_space + 1 :] + + # print(child) + + body["children"].extend(child) + except IndexError as e: + pass + body = json.dumps(body) # print(body) @@ -298,10 +356,15 @@ def send_to_notion( def handle_duplicates( post, overwrite, + ignore_sent_to_notion=IGNORE_SENT_TO_NOTION, subreddit=None, credentials=CREDENTIALS_FILE, ): + # If rebuilding the whole database, no need for dupe handling (this speeds things up a ton) + if ignore_sent_to_notion: + return True + if not subreddit: subreddit = post["subreddit"] @@ -362,6 +425,48 @@ def name_in_do_not_post(post): return post["author"] in DO_NOT_POST +# Need a parser because Notion is wack and doesn't natively do markdown. Only doing it for links for now. 
@@ -298,10 +356,15 @@
 def handle_duplicates(
     post,
     overwrite,
+    ignore_sent_to_notion=IGNORE_SENT_TO_NOTION,
     subreddit=None,
     credentials=CREDENTIALS_FILE,
 ):
 
+    # If rebuilding the whole database, no need for dupe handling (this speeds things up a ton)
+    if ignore_sent_to_notion:
+        return True
+
     if not subreddit:
         subreddit = post["subreddit"]
 
@@ -362,6 +425,48 @@
     return post["author"] in DO_NOT_POST
 
 
+# Need a parser because Notion is wack and doesn't natively do markdown. Only doing it for links for now.
+def parse_markdown_links(text):
+    pattern = r"\[([^\]]+)\]\((http[s]?://[^\)]+)\)|http[s]?://[\w./%?#=-]+"
+    segments = []
+    last_end = 0
+
+    for match in re.finditer(pattern, text):
+        start_text = text[last_end : match.start()]
+        if start_text:
+            # At the risk of chopping off a few letters, do not let this be longer than 2000 or Notion's API will error
+            segments.append({"type": "text", "text": {"content": start_text[:2000]}})
+
+        if match.group(1) and match.group(2):
+            link_text = match.group(1)  # Link text
+            link_url = match.group(2)  # URL
+            segments.append(
+                {
+                    "type": "text",
+                    "text": {"content": link_text, "link": {"url": link_url}},
+                    "annotations": {"bold": True},
+                }
+            )
+        else:
+            link_url = match.group(0)  # The entire match is the URL
+            segments.append(
+                {
+                    "type": "text",
+                    "text": {"content": link_url, "link": {"url": link_url}},
+                }
+            )
+
+        last_end = match.end()
+
+    # Text after the last link (if any)
+    remaining_text = text[last_end:]
+    if remaining_text:
+        # At the risk of chopping off a few letters, do not let this be longer than 2000 or Notion's API will error
+        segments.append({"type": "text", "text": {"content": remaining_text[:2000]}})
+
+    return segments
+
+
 def send_updated_score_to_notion(
     post,
     subreddit=None,
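For reference, here is the segment list parse_markdown_links() builds for a typical comment fragment (illustrative only, not part of the patch; the sample text is made up):

    segments = parse_markdown_links(
        "Map by [Atlas](https://example.com/atlas) - more at https://example.com"
    )
    # [
    #   {"type": "text", "text": {"content": "Map by "}},
    #   {"type": "text",
    #    "text": {"content": "Atlas", "link": {"url": "https://example.com/atlas"}},
    #    "annotations": {"bold": True}},
    #   {"type": "text", "text": {"content": " - more at "}},
    #   {"type": "text",
    #    "text": {"content": "https://example.com", "link": {"url": "https://example.com"}}},
    # ]

Each entry is already a valid Notion rich_text object, so the caller can drop the list straight into a block payload.
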
@@ -457,3 +562,104 @@ def send_updated_username_to_notion(name, credentials=CREDENTIALS_FILE):
     )
 
     # print(update_response.json())
+
+
+def get_creator_link(post, credentials=CREDENTIALS_FILE):
+    # Returns None if no url (empty string throws an API error)
+    creator_link = None
+
+    # If there is a comment
+    if len(post["comments"]) > 0:
+        # Url regex
+        pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[_#?./%=-])+"
+
+        urls = re.findall(pattern, post["comments"][0])
+
+        # Get the first Patreon link - Doing a loop to check for Patreon links first
+        # to set them as higher priority (if it goes in order it might grab a wikipedia link or something)
+        for url in urls:
+            if "patreon" in url:
+                creator_link = url
+                return creator_link
+
+        # If no Patreon link just get the first other non-imgur non-reddit link
+        # (right 99% of the time, sometimes it grabs silly links like wikipedia)
+        for url in urls:
+            if not "imgur" in url and not "reddit" in url:
+                creator_link = url
+                return creator_link
+
+    # If no url return None
+    return creator_link
+
+
+# Untested, unused function - for updating links on existing pages only
+# def send_creator_link_to_notion(post, credentials=CREDENTIALS_FILE):
+
+#     print(f"Adding Patreon link for {post['author']} in Notion...")
+
+#     with open(credentials) as credentials_json:
+#         credentials = json.load(credentials_json)
+
+#     token = credentials["notion_token"]
+
+#     headers = {
+#         "Authorization": "Bearer " + token,
+#         "Content-Type": "application/json",
+#         "Notion-Version": "2022-06-28",
+#     }
+
+#     # Get page by title
+#     notion_search_url = f"https://api.notion.com/v1/databases/{NOTION_DB_ID}/query"
+#     search_payload = {
+#         "filter": {"property": "Name", "title": {"equals": post["title"]}}
+#     }
+
+#     search_response = requests.post(
+#         notion_search_url, json=search_payload, headers=headers
+#     )
+
+#     # print(search_response.json())
+
+#     # Update creator link for all pages matching post title
+#     for page in search_response.json()["results"]:
+
+#         notion_page_url = f"https://api.notion.com/v1/pages/{page['id']}"
+
+#         # Returns None if no url (empty string throws an API error)
+#         creator_link = None
+
+#         try:
+#             # Url regex
+#             pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[_#?./%=-])+"
+
+#             urls = re.findall(pattern, post["comments"][0])
+
+#             # Get the first Patreon link - Doing a loop to check for Patreon links first
+#             # to set them as higher priority (if it goes in order it might grab a wikipedia link or something)
+#             for url in urls:
+#                 if "patreon" in url:
+#                     creator_link = url
+#                     return creator_link
+
+#             # If no Patreon link, get the first other link
+#             for url in urls:
+#                 # Filter out Imgur and Reddit cause they're usually not going to a creator's site
+#                 if not "imgur" in url and not "reddit" in url:
+#                     creator_link = url
+#                     break
+
+#             update_payload = {
+#                 "properties": {
+#                     "Creator": {"url": creator_link},
+#                 },
+#             }
+
+#             update_response = requests.patch(
+#                 notion_page_url, json=update_payload, headers=headers
+#             )
+
+#         except IndexError as e:
+#             print(post["title"], "error adding creator_link:", e)
+
+#         # print(update_response.json())
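
A quick illustration of the link priority in get_creator_link() (illustrative only, not part of the patch; the sample post is made up):

    post = {
        "comments": [
            "Full res: https://imgur.com/a/abc123 - support my maps at https://www.patreon.com/somecreator"
        ]
    }
    print(get_creator_link(post))  # https://www.patreon.com/somecreator

The Patreon pass runs first so a creator's funding page wins over gallery or wiki links; only if no Patreon URL is present does the second pass fall back to the first non-imgur, non-reddit URL.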