Skip to content

Commit

Permalink
Merge pull request #5 from EpicRandomGuy2/legendlore/dev
Browse files Browse the repository at this point in the history
Add creator links and first post to database entries, switch to new databases
  • Loading branch information
EpicRandomGuy2 authored Apr 11, 2024
2 parents df6ce0a + c201f45 commit d2f02fb
Show file tree
Hide file tree
Showing 4 changed files with 239 additions and 22 deletions.
9 changes: 5 additions & 4 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,21 @@
APP_NAME = "LegendLore"
APP_VERSION = "1.0"
if env == "PROD":
CONNECTION_STRING = "mongodb://192.168.1.47:27017/"
CONNECTION_STRING = "mongodb://192.168.1.47:27017/" # Prod DB
DB_NAME = "MapTaggerReddit"
else:
CONNECTION_STRING = "mongodb://localhost:27017/"
CONNECTION_STRING = "mongodb://192.168.1.47:27018/" # Dev DB
DB_NAME = "MapTaggerReddit"
DEFAULT_SUBREDDIT = "all"
CREDENTIALS_FILE = "credentials.json"
if env == "PROD":
NOTION_DB_ID = "95830a9189804ba29e9681e78b0236af" # Prod Notion
NOTION_DB_ID = "9bc0c253895d4dbfa4a9ca62833af0d3" # Prod Notion
else:
NOTION_DB_ID = "e7d05d2c6280444698b59fa79df3f78f" # Dev Notion
NOTION_DB_ID = "2bbecdb9a56943eb886f0ecf11d427d8" # Dev Notion
NOTION_DB_NAME = "LegendLore"
NUMBER_OF_DAYS_OLD = 7
UPDATE_SCORES_LIMIT = 250
IGNORE_SENT_TO_NOTION = False
SUBREDDITS = [
"battlemaps",
"dndmaps",
Expand Down
6 changes: 5 additions & 1 deletion legend_lore.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
SUBREDDITS,
NUMBER_OF_DAYS_OLD,
UPDATE_SCORES_LIMIT,
IGNORE_SENT_TO_NOTION,
)
from name_change import NAME_CHANGE

Expand Down Expand Up @@ -130,11 +131,14 @@ def main():
try:
notion.send_to_notion(
post,
overwrite=False,
overwrite=True,
ignore_sent_to_notion=IGNORE_SENT_TO_NOTION,
update_score=update_scores,
updated_score_titles=updated_score_titles,
)

# notion.send_creator_link_to_notion(post)

break
except Exception as e:
# If failure, wait 10 seconds and try again (up to 5 times)
Expand Down
36 changes: 21 additions & 15 deletions mongodb_local.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
from pymongo import MongoClient
from pandas import DataFrame
from config import CONNECTION_STRING, DB_NAME, DEFAULT_SUBREDDIT, SUBREDDITS
from config import (
CONNECTION_STRING,
DB_NAME,
DEFAULT_SUBREDDIT,
SUBREDDITS,
IGNORE_SENT_TO_NOTION,
)


def get_database_client(connection_string=CONNECTION_STRING, db_name=DB_NAME):
Expand Down Expand Up @@ -181,23 +187,23 @@ def set_sent_to_notion(
db_name=DB_NAME,
database=None,
):
if not IGNORE_SENT_TO_NOTION:
if database == None:
database_client = get_database_client(connection_string, db_name)
database = database_client[subreddit]

if database == None:
database_client = get_database_client(connection_string, db_name)
database = database_client[subreddit]

# Get post from DB
query = {"title": post["title"]}
post_df = DataFrame(database.find(query))
# Get post from DB
query = {"title": post["title"]}
post_df = DataFrame(database.find(query))

# Update any matching items in the query (should only be 1)
for index, row in post_df.iterrows():
query = {"_id": row["_id"]}
new = {"$set": {"sent_to_notion": sent}}
database.update_many(query, new)
database_client["all"].update_many(query, new)
# Update any matching items in the query (should only be 1)
for index, row in post_df.iterrows():
query = {"_id": row["_id"]}
new = {"$set": {"sent_to_notion": sent}}
database.update_many(query, new)
database_client["all"].update_many(query, new)

# print(f"sent_to_notion {sent} added to {post_df['title']}")
# print(f"sent_to_notion {sent} added to {post_df['title']}")


def update_post_score(
Expand Down
210 changes: 208 additions & 2 deletions notion.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import time
import re
import requests
import json
import httpx
Expand All @@ -10,7 +11,7 @@
from bs4 import BeautifulSoup
from mongodb_local import get_database_client, get_post_from_db, set_sent_to_notion
from pandas import DataFrame
from config import NOTION_DB_ID, CREDENTIALS_FILE
from config import NOTION_DB_ID, CREDENTIALS_FILE, IGNORE_SENT_TO_NOTION
from do_not_post import DO_NOT_POST
from name_change import NAME_CHANGE
from pprint import pprint
Expand All @@ -19,6 +20,7 @@
def send_to_notion(
post,
overwrite=False,
ignore_sent_to_notion=IGNORE_SENT_TO_NOTION,
update_score=False,
updated_score_titles=set(),
subreddit=None,
Expand Down Expand Up @@ -47,7 +49,15 @@ def send_to_notion(
return

# If dupe and no overwrite, skip this post
elif handle_duplicates(post, overwrite, subreddit=subreddit) == False:
elif (
handle_duplicates(
post,
overwrite,
ignore_sent_to_notion=ignore_sent_to_notion,
subreddit=subreddit,
)
== False
):
return
# Else if post does not exist, or if we are overwriting, make post as usual

Expand Down Expand Up @@ -96,6 +106,7 @@ def send_to_notion(
{"text": {"content": post["author"].lstrip("u/")}}
],
},
"Link": {"url": get_creator_link(post)},
"Score": {"type": "number", "number": post["score"]},
"Subreddit": {
"type": "multi_select",
Expand Down Expand Up @@ -269,6 +280,53 @@ def send_to_notion(

body["children"].extend(child)

# Append first comment child if it exists, else skip and move on
try:
first_comment = post["comments"][0]

# API only accepts 2000 characters per block - need to break it up
number_of_blocks = (len(first_comment) // 2000) + 1
word_break_index = 0
new_block_first_word = ""

for i in range(0, number_of_blocks):

# Logic to stop blocks from splitting words down the middle
word_block = first_comment[2000 * i : 2000 * (i + 1)]

# On the last loop we need the last word, there will be no rightmost_space
# so set it to take the highest possible index
if i != number_of_blocks - 1:
rightmost_space = word_block.rfind(" ")

else:
# Stops the last part of the post from getting cut off due to no space
rightmost_space = 2000

word_break_index = 2000 * i + rightmost_space

child = [
{
"object": "block",
"type": "quote",
"quote": {
"rich_text": parse_markdown_links(
new_block_first_word
+ first_comment[2000 * i : word_break_index]
)
},
}
]

# Don't include the space in the new block
new_block_first_word = word_block[rightmost_space + 1 :]

# print(child)

body["children"].extend(child)
except IndexError as e:
pass

body = json.dumps(body)

# print(body)
Expand Down Expand Up @@ -298,10 +356,15 @@ def send_to_notion(
def handle_duplicates(
post,
overwrite,
ignore_sent_to_notion=IGNORE_SENT_TO_NOTION,
subreddit=None,
credentials=CREDENTIALS_FILE,
):

# If rebuilding the whole database, no need for dupe handling (this speeds things up a ton)
if ignore_sent_to_notion:
return True

if not subreddit:
subreddit = post["subreddit"]

Expand Down Expand Up @@ -362,6 +425,48 @@ def name_in_do_not_post(post):
return post["author"] in DO_NOT_POST


# Notion's API does not render markdown natively, so links have to be converted into rich-text objects by hand. Only links are handled for now.
def parse_markdown_links(text):
    """Convert text containing links into a list of Notion rich-text objects.

    Markdown links (``[label](url)``) and bare ``http(s)://`` URLs become
    linked text objects; everything between them is emitted as plain text
    objects, in the order the pieces appear in ``text``.

    Args:
        text: The raw text to convert.

    Returns:
        A list of dicts shaped like Notion ``text`` rich-text objects.
        Markdown links are bold and use the label as their content; bare
        URLs use the URL itself as their content. An empty ``text`` yields
        an empty list.
    """
    # Notion's API errors on a text object whose content exceeds this length.
    max_content_length = 2000

    # First alternative: markdown link, label in group 1 and URL in group 2.
    # Second alternative: bare URL, no capture groups.
    pattern = r"\[([^\]]+)\]\((http[s]?://[^\)]+)\)|http[s]?://[\w./%?#=-]+"

    def plain_text_segments(plain_text):
        # Split over-long text into several objects instead of truncating it,
        # so nothing is dropped. Empty text produces no objects.
        return [
            {
                "type": "text",
                "text": {
                    "content": plain_text[offset : offset + max_content_length]
                },
            }
            for offset in range(0, len(plain_text), max_content_length)
        ]

    segments = []
    last_end = 0

    for match in re.finditer(pattern, text):
        # Plain text between the previous link (or the start) and this link
        segments.extend(plain_text_segments(text[last_end : match.start()]))

        if match.group(1) and match.group(2):
            # Markdown link: show the label, point it at the URL
            segments.append(
                {
                    "type": "text",
                    "text": {
                        "content": match.group(1),
                        "link": {"url": match.group(2)},
                    },
                    "annotations": {"bold": True},
                }
            )
        else:
            # Bare URL: the entire match is both the label and the target
            link_url = match.group(0)
            segments.append(
                {
                    "type": "text",
                    "text": {"content": link_url, "link": {"url": link_url}},
                }
            )

        last_end = match.end()

    # Text after the last link (if any)
    segments.extend(plain_text_segments(text[last_end:]))

    return segments


def send_updated_score_to_notion(
post,
subreddit=None,
Expand Down Expand Up @@ -457,3 +562,104 @@ def send_updated_username_to_notion(name, credentials=CREDENTIALS_FILE):
)

# print(update_response.json())


def get_creator_link(post, credentials=CREDENTIALS_FILE):
    """Pick the most likely creator link from a post's first comment.

    Patreon links take priority so that an unrelated link appearing earlier
    in the comment (e.g. Wikipedia) is not chosen over them. Without a
    Patreon link, the first URL that is neither an imgur nor a reddit link
    is used.

    Args:
        post: Mapping with a "comments" entry; only the first comment is
            searched for URLs.
        credentials: Unused here; kept so existing callers keep working.

    Returns:
        The chosen URL string, or None when there is no comment or no
        suitable URL (None rather than "" because an empty string throws a
        Notion API error).
    """
    comments = post["comments"]

    # No comments at all (empty or None) means there is nothing to search
    if not comments:
        return None

    # Url regex
    pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[_#?./%=-])+"

    # The pattern accepts "." and "?", so a URL that ends a sentence drags the
    # punctuation along with it; trim it so the stored link actually resolves.
    urls = [url.rstrip(".?") for url in re.findall(pattern, comments[0])]

    # Host names are case-insensitive, so compare against a lowered copy but
    # return the URL exactly as the author wrote it.
    for url in urls:
        if "patreon" in url.lower():
            return url

    # If no Patreon link just get the first other non-imgur non-reddit link
    # (right most of the time, sometimes it grabs silly links like wikipedia)
    for url in urls:
        lowered = url.lower()
        if "imgur" not in lowered and "reddit" not in lowered:
            return url

    # If no url return None
    return None


# Untested, unused function - for updating links on existing pages only
# def send_creator_link_to_notion(post, credentials=CREDENTIALS_FILE):

# print(f"Adding Patreon link for {post['author']} in Notion...")

# with open(credentials) as credentials_json:
# credentials = json.load(credentials_json)

# token = credentials["notion_token"]

# headers = {
# "Authorization": "Bearer " + token,
# "Content-Type": "application/json",
# "Notion-Version": "2022-06-28",
# }

# # Get page by title
# notion_search_url = f"https://api.notion.com/v1/databases/{NOTION_DB_ID}/query"
# search_payload = {
# "filter": {"property": "Name", "title": {"equals": post["title"]}}
# }

# search_response = requests.post(
# notion_search_url, json=search_payload, headers=headers
# )

# # print(search_response.json())

# # Update the creator link for all pages matching post title
# for page in search_response.json()["results"]:

# notion_page_url = f"https://api.notion.com/v1/pages/{page['id']}"

# # Returns None if no url (empty string throws an API error)
# creator_link = None

# try:
# # Url regex
# pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[_#?./%=-])+"

# urls = re.findall(pattern, post["comments"][0])

# Get the first Patreon link - Doing a loop to check for Patreon links first
# to set them as higher priority (if it goes in order it might grab a wikipedia link or something)
# for url in urls:
# if "patreon" in url:
# creator_link = url
# return creator_link

# # If no Patreon link, get the first other link
# for url in urls:
# # Filter out Imgur and Reddit cause they're usually not going to a creator's site
# if not "imgur" in url and not "reddit" in url:
# creator_link = url
# break

# update_payload = {
# "properties": {
# "Creator": {"url": creator_link},
# },
# }

# update_response = requests.patch(
# notion_page_url, json=update_payload, headers=headers
# )

# except IndexError as e:
# print(post["title"], "error adding creator_link:", e)

# # print(update_response.json())

0 comments on commit d2f02fb

Please sign in to comment.