From a9734002017a28159b160c36bc34e7266a91a290 Mon Sep 17 00:00:00 2001
From: Lauler
Date: Sun, 17 Dec 2023 16:13:35 +0100
Subject: [PATCH] Update bot to use only PRAW, remove Pushshift API

---
 bot.py                |  68 +++++++++------
 src/comment.py        |  12 +--
 src/data.py           | 199 ++++++++++++++++++++++++++++++------
 src/markdown.py       |  67 +++++++-------
 src/pushshift/data.py | 143 ++++++++++++++++++++++++++++++
 5 files changed, 365 insertions(+), 124 deletions(-)
 create mode 100644 src/pushshift/data.py

diff --git a/bot.py b/bot.py
index f09314a..29ac033 100755
--- a/bot.py
+++ b/bot.py
@@ -1,29 +1,25 @@
+import logging
 import os
 import torch
+import praw
 import pandas as pd
 import datetime as dt
-import logging
-import praw
-from psaw import PushshiftAPI
+from dotenv import load_dotenv
 from transformers import (
-    AutoTokenizer,
-    AutoModelForTokenClassification,
     AutoModelForSeq2SeqLM,
+    AutoModelForTokenClassification,
+    AutoTokenizer,
     pipeline,
 )
 from src.comment import choose_post, create_reply_msg
 from src.data import (
-    download_comments,
+    analyze_comments,
     download_submission,
-    filter_comments,
     get_posted_comments,
     merge_comment_submission,
-    predict_comments,
-    preprocess_comments,
     save_feather,
 )
 from src.translate import translation_preprocess
-from dotenv import load_dotenv

 logging.basicConfig(
     filename="sprakpolisen.log",
@@ -39,9 +35,14 @@
 model = AutoModelForTokenClassification.from_pretrained("Lauler/deformer")
 model.to(device)

+# NER pipeline
+pipe = pipeline("ner", model=model, tokenizer=tokenizer, device=0)
+
 # Machine Translation model
 tokenizer_translate = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-sv-en")
-model_translate = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-sv-en", output_attentions=True)
+model_translate = AutoModelForSeq2SeqLM.from_pretrained(
+    "Helsinki-NLP/opus-mt-sv-en", output_attentions=True
+)
 model_translate.eval()
 model_translate.to(device)

@@ -61,22 +62,27 @@
     password=pw,
 )

-api = PushshiftAPI(reddit)
+subreddit = reddit.subreddit("sweden")

-df = download_comments(api, weeks=0, hours=4, minutes=45)
-df = preprocess_comments(df)  # Sentence splitting, and more
-pipe = pipeline("ner", model=model, tokenizer=tokenizer, device=0)
-df = predict_comments(df, pipe, threshold=0.98)  # Only saves preds above threshold
-df_comment = filter_comments(df)
-#### Write comment info to file ####
-date = dt.datetime.now().strftime("%Y-%m-%d_%H-%M")
-save_feather(df_comment, type="comment", date=date)
+df_subs = []
+df_comments = []
+for submission in subreddit.hot(limit=35):
+    if submission.num_comments == 0:
+        continue

-#### Download info about submission thread ####
-df_comment["id_sub"] = df_comment["link_id"].str.slice(start=3)
-df_sub = download_submission(df_comment["id_sub"].tolist(), reddit_api=reddit)
+    df_sub = download_submission(submission)
+    df_comment = analyze_comments(submission, pipe=pipe)
+    df_subs.append(df_sub)
+    df_comments.append(df_comment)
+
+df_sub = pd.concat(df_subs).reset_index(drop=True)
+df_comment = pd.concat(df_comments).reset_index(drop=True)
+
+#### Write comment and submission info to file ####
+date = dt.datetime.now().strftime("%Y-%m-%d_%H-%M")
+save_feather(df_comment, type="comment", date=date)
 save_feather(df_sub, type="submission", date=date)

 # Merge
@@ -89,12 +95,17 @@
 except:
     pass

+df_all = df_all[~(df_all["n_mis_det"] == 1)].reset_index(drop=True)
 # Choose which comment to post reply to
-df_post = choose_post(df_all, min_hour=0.7, max_hour=17)
+df_post = choose_post(df_all, min_hour=0.7, max_hour=19)
+
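+# Debugging aids (presumably from interactive runs): inspect the merged columns,
+# or uncomment the line below to hand-pick a reply target instead of choose_post().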
+df_all.columns
 # df_post = df_all.iloc[1:2].reset_index(drop=True)

-# df_post = df_all.iloc[2:3].reset_index(drop=True)
-df_post["sentences"] = df_post["sentences"].apply(lambda sens: [sen.replace("…", ".") for sen in sens])
+df_post["sentences"] = df_post["sentences"].apply(
+    lambda sens: [sen.replace("…", ".") for sen in sens]
+)

 #### Translate to English
 pipes = translation_preprocess(
@@ -106,8 +117,8 @@
 reply_msg = create_reply_msg(df_post, pipes=pipes)

-save_feather(df_all, type="all", date=date)
+save_feather(df_all, type="all", date=date)

 for i in range(len(df_all)):
     try:
@@ -123,7 +134,7 @@
         # if a single comment author in the comment chain has blocked SprakpolisenBot.
         logging.error(f'Failed replying to comment id {df_post["id"][0]} because of block.')
         df_all = df_all[df_all["id"] != df_post["id"][0]]  # Remove unsuccessful reply attempt
-        df_post = choose_post(df_all, min_hour=1, max_hour=17)
+        df_post = choose_post(df_all, min_hour=1, max_hour=19)

         #### Translate to English
         pipes = translation_preprocess(
@@ -134,6 +145,7 @@
         )

         reply_msg = create_reply_msg(df_post, pipes=pipes)
+        logging.info("Successfully replied.")

 # Save replies/posted comments
diff --git a/src/comment.py b/src/comment.py
index e7e25a0..6184089 100644
--- a/src/comment.py
+++ b/src/comment.py
@@ -41,11 +41,11 @@ def choose_post(df_all, min_hour, max_hour):
         logger.exception("No suitable comment reply candidates. Exiting.")
         raise e

-    if any(df_all["nr_mistakes"] > 1):
-        df_multimistake = df_all[df_all["nr_mistakes"] > 1].reset_index(drop=True)
+    if any(df_all["n_mis"] > 1):
+        df_multimistake = df_all[df_all["n_mis"] > 1].reset_index(drop=True)

         if len(df_multimistake) > 1:
-            max_mistake_idx = df_multimistake["nr_mistakes"].idxmax()
+            max_mistake_idx = df_multimistake["n_mis"].idxmax()
             df_post = df_multimistake.iloc[max_mistake_idx : (max_mistake_idx + 1), :]
         else:
             df_post = df_multimistake
@@ -81,7 +81,6 @@ def correct_sentence(preds, sentences):
     offset = 0
     correct_sens = []
     for pred, sentence in zip(preds, sentences):
-
         if len(pred) == 0:
             continue

@@ -108,7 +107,7 @@ def correct_sentence(preds, sentences):

 def correct_sentence_en(preds, correct_sens):
     """
-    We have correctly translated sentences already, but want to
+    We have correctly translated sentences already, but want to
     introduce the wrong form of they/them with a strikethrough next
     to the already corrected instance of they/them/the/those.
     """
@@ -131,14 +130,12 @@ def correct_sentence_en(preds, correct_sens):
     them_words = ["Them", "them"]

     for i, (pred, sentence) in enumerate(zip(preds_en, correct_sens)):
-
         target_indices = []  # To check if we're trying to correct same word twice or more
         contains_pred_mismatch = False  # Check if 'de' maps to they_words, and 'dem' to them_words
         if len(pred) == 0:
             continue

         for j, entity in enumerate(pred):
-
             target_indices.append(entity["index"])
             if entity["word"] not in (they_words + them_words):
                 logger.info(
@@ -211,4 +208,3 @@ def create_reply_msg(df_post, pipes):
         message += create_guide(df_post)

     return message
-
diff --git a/src/data.py b/src/data.py
index e771e5f..4ffbc22 100644
--- a/src/data.py
+++ b/src/data.py
@@ -1,11 +1,10 @@
+import glob
 import os
-import time
 import datetime as dt
 import re
 import pandas as pd
 import logging
 from nltk.tokenize import sent_tokenize
-from .utils import join_insertion
 from .markdown import remove_emoji


@@ -80,14 +79,30 @@
 ]


+def filter_dedem_comments(df):
+    """
+    Keep only comments with de/dem.
+    """
+    dedem_pattern = "(?
1, dedem_sentences)
+    return list(dedem_sentences)


@@ -100,13 +115,22 @@ def predict_dedem(comment, pipe):
     de_pattern = "(?
-    df = df[df["sentences"].apply(lambda x: any([len(y) != 0 for y in x])) > 0].reset_index(drop=True)
+    df = df[df["sentences"].apply(lambda x: any([len(y) != 0 for y in x])) > 0].reset_index(
+        drop=True
+    )

     # Split sentences also on new paragraphs "\n\n" (in case someone doesn't use punctuation)
     df["sentences"] = df["sentences"].apply(lambda sens: [sen.splitlines() for sen in sens])

     # Flatten list of lists and remove empty sentences consisting of only ''.
-    df["sentences"] = df["sentences"].apply(lambda sens: [sen for split_sens in sens for sen in split_sens])
+    df["sentences"] = df["sentences"].apply(
+        lambda sens: [sen for split_sens in sens for sen in split_sens]
+    )
     df["sentences"] = [[sen for sen in sens if len(sen) > 0] for sens in df["sentences"]]

     # Remove emojis
@@ -206,7 +244,9 @@ def preprocess_comments(df):
     df["sentences"] = df["sentences"].apply(lambda sens: [sen.strip() for sen in sens])

     # Remove 2 or more spaces in a row and replace by single space.
-    df["sentences"] = df["sentences"].apply(lambda sens: [re.sub(" {2,}", " ", sen) for sen in sens])
+    df["sentences"] = df["sentences"].apply(
+        lambda sens: [re.sub(" {2,}", " ", sen) for sen in sens]
+    )

     logger.info("Finished preprocessing.")

@@ -230,11 +270,27 @@ def predict_comments(df, pipe, threshold=0.98):
     return df


-def count_incorrect(preds, word):
-
+def count_incorrect_word(preds, word):
+    """
+    Only use for "det".
+    """
     count = 0
     for pred in preds:
+        for entity in pred:
+            if len(pred) == 0:
+                break
+
+            count += 1 if (entity["word"].lower() == word and entity["entity"] != "DET") else 0
+
+    return count
+
+def count_all_word(preds, word):
+    """
+    Count all occurrences of a word in the original comment (entity["word"] is the original word).
+    """
+    count = 0
+    for pred in preds:
         for entity in pred:
             if len(pred) == 0:
                 break
@@ -244,18 +300,46 @@ def count_incorrect(preds, word):
     return count


+def count_incorrect_entity(preds, word):
+    count = 0
+    for pred in preds:
+        for entity in pred:
+            if len(pred) == 0:
+                break
+
+            count += 1 if entity["entity"].lower() == word else 0
+
+    return count
+
+
 def filter_comments(df):
     logger.info("Filtering comments...")
     # Remove rows with only empty predictions (i.e. sentence preds under the threshold)
-    df_comment = df[df["pred"].apply(lambda x: any([len(y) != 0 for y in x])) > 0].reset_index(drop=True)
+    df_comment = df[df["pred"].apply(lambda x: any([len(y) != 0 for y in x])) > 0].reset_index(
+        drop=True
+    )

     # Create extra variables
-    df_comment["nr_mistakes"] = df_comment["pred"].apply(lambda x: sum([len(y) for y in x]))
-    df_comment["nr_mistakes_de"] = df_comment["pred"].apply(lambda x: count_incorrect(x, word="de"))
-    df_comment["nr_mistakes_dem"] = df_comment["pred"].apply(lambda x: count_incorrect(x, word="dem"))
+    df_comment["n_mis"] = df_comment["pred"].apply(lambda x: sum([len(y) for y in x]))
+    df_comment["n_mis_de"] = df_comment["pred"].apply(lambda x: count_incorrect_word(x, word="de"))
+    df_comment["n_mis_dem"] = df_comment["pred"].apply(
+        lambda x: count_incorrect_word(x, word="dem")
+    )
+    df_comment["n_mis_det"] = df_comment["pred"].apply(
+        lambda x: count_incorrect_entity(x, word="det")
+    )
+    df_comment["n_mis_enda"] = df_comment["pred"].apply(
+        lambda x: count_incorrect_word(x, word="enda")
+    )
+    df_comment["n_mis_ända"] = df_comment["pred"].apply(
+        lambda x: count_incorrect_word(x, word="ända")
+    )
+
     df_comment["author"] = df_comment["author"].apply(lambda x: x.name if x is not None else None)
     df_comment = df_comment[df_comment["author"] != "SprakpolisenBot"]  # Filter out bot's comments
-    df_comment["subreddit"] = df_comment["subreddit"].apply(lambda x: x.display_name if x is not None else None)
+    df_comment["subreddit"] = df_comment["subreddit"].apply(
+        lambda x: x.display_name if x is not None else None
+    )

     if len(df_comment) > 0:
         df_comment["time_downloaded"] = int(dt.datetime.now().timestamp())
@@ -283,35 +367,6 @@ def save_feather(df, type, date):
     df.to_feather(f"data/{type}/{date}_{type}.feather")


-def download_submission(link_ids, reddit_api, backoff_factor=0.4):
-
-    link_ids = list(set(link_ids))
-    df_list = []
-    for link_id in link_ids:
-        for i in range(5):
-            # Exponential backoff
-            backoff_time = backoff_factor * (2**i)
-
-            try:
-                submission = reddit_api.submission(id=link_id)
-                logger.info(f"Downloading {submission.title} at {submission.permalink}")
-                submission_data = vars(submission)
-                submission_cols = {key: submission_data[key] for key in KEEP_SUBMISSION_COLUMNS}
-                df_list.append(submission_cols)
-                break
-
-            except:
-                logger.error(f"Download of {link_id} failed. Retry {i}.")
-
-            time.sleep(backoff_time)
-
-    df_sub = pd.DataFrame(df_list)
-    df_sub = df_sub.add_suffix("_sub")
-    df_sub["author_sub"] = df_sub["author_sub"].apply(lambda x: x.name if x is not None else None)
-
-    return df_sub
-
-
 def merge_comment_submission(df_comment, df_sub):
     df_all = df_comment.merge(
         df_sub[
@@ -340,6 +395,18 @@ def merge_comment_submission(df_comment, df_sub):
     return df_all


+def get_previous_submissions(folder="data/submission"):
+    list_of_files = glob.glob(f"{folder}/*")
+
+    # If empty folder, return empty dataframe
+    if len(list_of_files) == 0:
+        return pd.DataFrame()
+
+    latest_file = max(list_of_files, key=os.path.getctime)
+    df_sub = pd.read_feather(latest_file)
+    return df_sub
+
+
 def get_posted_comments(folder="data/posted"):
     """
     Retrieve and save posted comments to single file.
@@ -374,7 +441,9 @@ def aggregate_posted_comments(folder="data/posted"):
     posted_files = [re.match(r"(.*)_posted\.feather", file).group(1) for file in posted_files]
     posted_files = [re.sub("_", " ", file) for file in posted_files]
-    posted_files = [re.sub(r"(\d{4}-\d{2}-\d{2}\s\d{2})-(\d{2})", r"\1:\2", file) for file in posted_files]
+    posted_files = [
+        re.sub(r"(\d{4}-\d{2}-\d{2}\s\d{2})-(\d{2})", r"\1:\2", file) for file in posted_files
+    ]

     df["replied_time"] = posted_files
     df["replied_time"] = pd.to_datetime(df["replied_time"])
@@ -391,3 +460,21 @@ def aggregate_posted_comments(folder="data/posted"):
     df.to_feather(os.path.join(dest_dir, "df_posted.feather"))

     return df
+
+
+def analyze_comments(submission, pipe):
+    """
+    Run all filters and predictions on the comments of a submission.
+    """
+
+    df_comment = download_comments(submission)
+    # Regex and sentence splitting
+    df_comment = preprocess_comments(df_comment)
+    # Keep only comments with de/dem
+    df_comment = filter_dedem_comments(df_comment)
+    # Only saves preds above threshold
+    df_comment = predict_comments(df_comment, pipe, threshold=0.985)
+    # Keep only comments with incorrect usage of de/dem/det
+    df_comment = filter_comments(df_comment)
+
+    return df_comment
diff --git a/src/markdown.py b/src/markdown.py
index 9129647..c0d0c68 100644
--- a/src/markdown.py
+++ b/src/markdown.py
@@ -52,8 +52,8 @@ def reverse_replace(text, old, new, n):


 def wrongful_de_dem(df_post):
-    de_nr = df_post["nr_mistakes_de"].iloc[0]
-    dem_nr = df_post["nr_mistakes_dem"].iloc[0]
+    de_nr = df_post["n_mis_de"].iloc[0]
+    dem_nr = df_post["n_mis_dem"].iloc[0]

     if de_nr == 0 and dem_nr > 0:
         wrongful_msg = f"**{dem_nr}** felaktiga användningar av `dem`"
@@ -61,15 +61,10 @@ def wrongful_de_dem(df_post):
         wrongful_msg = f"**{de_nr}** felaktiga användningar av `de`"
     elif de_nr > 0 and dem_nr > 0:
         wrongful_msg = (
-            f"**{de_nr}** felaktiga användningar av `de` "
-            f"samt **{dem_nr}** felaktiga användningar av `dem`"
+            f"**{de_nr}** felaktiga användningar av `de` " f"samt **{dem_nr}** felaktiga användningar av `dem`"
         )

-    if (
-        (de_nr == 1 and dem_nr == 0)
-        or (de_nr == 0 and dem_nr == 1)
-        or (de_nr == 1 and dem_nr == 1)
-    ):
+    if (de_nr == 1 and dem_nr == 0) or (de_nr == 0 and dem_nr == 1) or (de_nr == 1 and dem_nr == 1):
         wrongful_msg = wrongful_msg.replace("felaktiga", "felaktig")
         wrongful_msg = wrongful_msg.replace("användningar", "användning")

@@ -98,12 +93,20 @@ def create_analysis_legend():


 def create_header(df_post):
-    if df_post["nr_mistakes"][0] <= 2:
+    if df_post["n_mis_det"][0] >= 2:
+        ts = ""
+        for _ in range(0, df_post["n_mis_det"][0]):
+            ts += "**t** "
+
+        message = (
+            f'Tjenixen, SpråkpolisenBot här {add_emoji("police")}. Jag är en bot som hittar borttappade t:n i `det`. '
+            f"Har du möjligtvis glömt dessa: {ts}? "
+        )
+    elif df_post["n_mis"][0] <= 2:
         message = (
-            f'Tjenixen, SpråkpolisenBot här {add_emoji("police")}. Jag är en bot som '
-            f"skiljer på `de` och `dem`. "
+            f'Tjenixen, SpråkpolisenBot här {add_emoji("police")}. Jag är en bot som ' f"skiljer på `de` och `dem`. "
         )
-    if df_post["nr_mistakes"][0] >= 3:
+    elif df_post["n_mis"][0] >= 3:
         message = (
             f'Stopp {add_emoji("car")}{add_emoji("siren")}! '
             f'Du har blivit gripen av SpråkpolisenBot {add_emoji("police")} '
@@ -123,16 +126,24 @@ def create_header(df_post):


 def create_guide(df_post):
-    guide_message = (
-        f"En guide med tips för att skilja på `de` och `dem` finnes "
-        f"på [Språkpolisens hemsida](https://lauler.github.io/sprakpolisen/guide.html). "
-        # f"En interaktiv demo där användare själva kan skriva in meningar och få dem "
-        # f"rättade [finns här](https://lauler.github.io/sprakpolisen/demo.html)."
-    )
+
+    if df_post["n_mis_det"][0] >= 2:
+        guide_message = (
+            f"Ett skippat **t** sparar dig kanske någon tiondels sekund, men kostar samtidigt "
+            f"minst lika mycket tid för varje enskild läsare av dina kommentarer. Respektera dina "
+            f"medredditörer: lås fast dina t:n i de**t**!"
+        )
+    else:
+        guide_message = (
+            f"En guide med tips för att skilja på `de` och `dem` finnes "
+            f"på [Språkpolisens hemsida](https://lauler.github.io/sprakpolisen/guide.html). "
+            # f"En interaktiv demo där användare själva kan skriva in meningar och få dem "
+            # f"rättade [finns här](https://lauler.github.io/sprakpolisen/demo.html)."
+        )

     message = ""
-    if df_post["nr_mistakes_dem"][0] >= 2:
+    if df_post["n_mis_dem"][0] >= 2:
         added_message = (
             f"Visste du att `de` är cirka 10 gånger vanligare än `dem` i svensk text? "
             f"Om du är osäker kring vilket som är rätt är det alltså statistiskt sett säkrast "
         )
         message += add_paragraph(added_message)

-    if df_post["sentences"].apply(
-        lambda sens: any([bool(re.search("[Dd]em flesta", sen)) for sen in sens])
-    )[0]:
+    if df_post["sentences"].apply(lambda sens: any([bool(re.search("[Dd]em flesta", sen)) for sen in sens]))[0]:
         added_message = (
             f"Visste du att det aldrig kan heta ~~dem flesta~~ på svenska? **De flesta** "
             f"är den enda korrekta formen av uttrycket."
         )
         message += add_paragraph(added_message)

     for word in ["andra", "värsta", "bästa", "sämsta", "första"]:
-        if df_post["sentences"].apply(
-            lambda sens: any([bool(re.search(f"[Dd]em {word}", sen)) for sen in sens])
-        )[0]:
+        if df_post["sentences"].apply(lambda sens: any([bool(re.search(f"[Dd]em {word}", sen)) for sen in sens]))[0]:
             added_message = (
                 f"Visste du att det inte kan heta ~~dem {word}~~? **De {word}** "
                 f"är den korrekta formen. När `de` används i en betydelse "
@@ -178,9 +185,7 @@ def create_guide_en(df_post):

     message = ""

-    if df_post["sentences"].apply(
-        lambda sens: any([bool(re.search("[Dd]em flesta", sen)) for sen in sens])
-    )[0]:
+    if df_post["sentences"].apply(lambda sens: any([bool(re.search("[Dd]em flesta", sen)) for sen in sens]))[0]:
         added_message = (
             f"Visste du att det aldrig kan heta ~~dem flesta~~ på svenska? **De flesta** "
             f"är den enda korrekta formen av uttrycket."
         )
         message += add_paragraph(added_message)

     for word in ["andra", "värsta", "bästa", "sämsta", "första"]:
-        if df_post["sentences"].apply(
-            lambda sens: any([bool(re.search(f"[Dd]em {word}", sen)) for sen in sens])
-        )[0]:
+        if df_post["sentences"].apply(lambda sens: any([bool(re.search(f"[Dd]em {word}", sen)) for sen in sens]))[0]:
             added_message = (
                 f"Visste du att det inte kan heta ~~dem {word}~~? **De {word}** "
                 f"är den korrekta formen. När `de` används i en betydelse "
diff --git a/src/pushshift/data.py b/src/pushshift/data.py
new file mode 100644
index 0000000..fe41c81
--- /dev/null
+++ b/src/pushshift/data.py
@@ -0,0 +1,143 @@
+import logging
+import datetime as dt
+import pandas as pd
+import time
+
+"""
+Legacy code from the first version of the bot based on Pushshift.
+"""
+
+logger = logging.getLogger(__name__)
+
+KEEP_COMMENT_COLUMNS = [
+    "id",
+    "link_id",
+    "score",
+    "author",
+    "body",
+    "subreddit_id",
+    "permalink",
+    "edited",
+    "ups",
+    "num_reports",
+    "total_awards_received",
+    "subreddit",
+    "gilded",
+    "can_mod_post",
+    "send_replies",
+    "parent_id",
+    "author_fullname",
+    "downs",
+    "collapsed",
+    "is_submitter",
+    "body_html",
+    "collapsed_reason",
+    "collapsed_reason_code",
+    "stickied",
+    "unrepliable_reason",
+    "score_hidden",
+    "locked",
+    "name",
+    "created",
+    "created_utc",
+    "subreddit_name_prefixed",
+    "controversiality",
+    "collapsed_because_crowd_control",
+    "mod_note",
+    "_fetched",
+]
+
+KEEP_SUBMISSION_COLUMNS = [
+    "author",
+    "author_flair_text",
+    "created",
+    "created_utc",
+    "gilded",
+    "id",
+    "is_meta",
+    "is_self",
+    "is_video",
+    "link_flair_text",
+    "locked",
+    "mod_note",
+    "name",
+    "num_comments",
+    "num_crossposts",
+    "num_duplicates",
+    "permalink",
+    "pinned",
+    "removal_reason",
+    "score",
+    "selftext",
+    "selftext_html",
+    "stickied",
+    "title",
+    "ups",
+    "upvote_ratio",
+    "url",
+]
+
+
+def start_time(weeks=0, days=0, hours=2, minutes=10):
+    """
+    How many days, hours and minutes back to query reddit for comments.
+    """
+    start_time = dt.datetime.now() - dt.timedelta(
+        weeks=weeks, days=days, hours=hours, minutes=minutes
+    )
+    return int(start_time.timestamp())
+
+
+def download_comments(api, weeks=0, days=0, hours=2, minutes=10):
+    logger.info(
+        f"Downloading all comments from /r/sweden from the last {weeks} weeks, {days} days, {hours} hours and {minutes} minutes."
+    )
+    # Download comments this far back in time.
+    after_time = start_time(weeks=weeks, days=days, hours=hours, minutes=minutes)
+    gen = api.search_comments(after=after_time, q="de|dem", subreddit="sweden")
+
+    # Get comments
+    df = pd.DataFrame([thing.__dict__ for thing in gen])
+    df = df[KEEP_COMMENT_COLUMNS]
+
+    return df
+
+
+def download_comments_between(api, start_time, end_time, q="de|dem", subreddit="sweden"):
+    logger.info(f"Downloading all comments from /r/sweden from {start_time} to {end_time}.")
+
+    gen = api.search_comments(after=start_time, before=end_time, q=q, subreddit=subreddit)
+
+    # Get comments
+    df = pd.DataFrame([thing.__dict__ for thing in gen])
+    df = df[KEEP_COMMENT_COLUMNS]
+
+    return df
+
+
+def download_submission(link_ids, reddit_api, backoff_factor=0.4):
+    link_ids = list(set(link_ids))
+    df_list = []
+    for link_id in link_ids:
+        for i in range(5):
+            # Exponential backoff
+            backoff_time = backoff_factor * (2**i)
+
+            try:
+                submission = reddit_api.submission(id=link_id)
+                logger.info(f"Downloading {submission.title} at {submission.permalink}")
+                submission_data = vars(submission)
+                submission_cols = {key: submission_data[key] for key in KEEP_SUBMISSION_COLUMNS}
+                df_list.append(submission_cols)
+                break
+
+            except:
+                logger.error(f"Download of {link_id} failed. Retry {i}.")
+
+            time.sleep(backoff_time)
+
+    df_sub = pd.DataFrame(df_list)
+    df_sub = df_sub.add_suffix("_sub")
+    df_sub["author_sub"] = df_sub["author_sub"].apply(lambda x: x.name if x is not None else None)
+
+    return df_sub