Bug: Synonym Sync: Duplicate rows #684

- Bug fix: Address issues related to mondo capitalization variation. - Update: Ensure the following columns are now in the output and are together: synonym_case_mondo, synonym_case_diff_mondo, synonym_case_mondo_is_many, synonym_case_source, synonym_case_diff_source, synonym_case_source_is_many. Note that previously, we had removed synonym_case_mondo & synonym_case_source, opting instead for the 'diff' columns, because it was previously only valuable to show the original capitalizations if there was a difference between the two. But now that we can have multiple variations in capitalization on the same syonym, it is useful to see the original case by itself, as wel as all the variations. - Update: For synonym_case_source_is_many, ensure that all variations show up in synonym_case_source and synonym_case_diff_source columns. Note that when there are multiple capitalization variations at the source, we only need 1 row. - Update: For all synonym_case_mondo_is_many, ensure that all variations show up in the synonym_case_diff_mondo column. But leave synonym_case_mondo as it is. We need to preserve the original case for that row, since unlike the source, we will retain multiple rows in the case that Mondo has multiple capitalization variations for a single synonym.
monarch-initiative · Dec 16, 2024 · 0bb9418 · 0bb9418
1 parent c323731
commit 0bb9418
Showing 1 changed file with 58 additions and 3 deletions.
diff --git a/src/scripts/sync_synonym.py b/src/scripts/sync_synonym.py
@@ -28,8 +28,15 @@
     'synonym_scope_source': '',
     'synonym_scope_mondo': '',
     'synonym': '',
+
+    # TODO temp: Don't know which of these columns I'll keep
+    'synonym_case_mondo': '',
     'synonym_case_diff_mondo': '',
+    'synonym_case_mondo_is_many': '',
+    'synonym_case_source': '',
     'synonym_case_diff_source': '',
+    'synonym_case_source_is_many': '',
+
     'source_id': '',
     'source_label': '',
     'synonym_type': '',
@@ -93,10 +100,58 @@ def _handle_synonym_casing_variations(df: pd.DataFrame) -> pd.DataFrame:
         row['synonym_case_mondo'] if row['synonym_case_mondo'] != row['synonym_case_source'] else '', axis=1)
     df['synonym_case_diff_source'] = df.apply(lambda row:
         row['synonym_case_source'] if row['synonym_case_source'] != row['synonym_case_mondo'] else '', axis=1)
-    # Capitalization variations for single source term synonym: aggregate
+
+    # todo: sorting here useful in the end? if not, drop
+    # sort_cols = [x for x in SORT_COLS if x in df.columns] + ['synonym_case_mondo', 'synonym_case_source', 'synonym_case_diff_source', 'synonym_case_diff_mondo']
+    # sort_cols = [x for x in SORT_COLS if x in df.columns]
+    # df = df.sort_values(by=sort_cols)
+    # a1 = df[df['source_id'] == 'DOID:6262']   # todo tmp
+
+    # Capitalization variations for single source term synonym: drop dupe rows & aggregate
     agg_dict: Dict[str, Union[str, Callable]] = {col: 'first' for col in df.columns}
-    agg_dict['synonym_case_diff_source'] = '|'.join
-    df = df.groupby(['mondo_id', 'source_id', 'synonym'], as_index=False).agg(agg_dict)
+    agg_dict['synonym_case_source'] = '|'.join
+    df = df.groupby(['mondo_id', 'source_id', 'synonym_scope', 'synonym'], as_index=False).agg(agg_dict)
+    # a2 = df[df['source_id'] == 'DOID:6262']  # todo tmp
+    # todo: decide what to do with this col for dupes in the end
+    df['synonym_case_source_is_many'] = df['synonym_case_source'].apply(lambda x: '|' in x)
+    # - Correct synonym_case_diff_source field for these cases
+    df['synonym_case_diff_source'] = df.apply(lambda row: row['synonym_case_source']
+        if row['synonym_case_source_is_many'] else row['synonym_case_diff_source'], axis=1)
+    # x1 = df[df['synonym_case_source_is_many'] == True]  # todo temp
+
+    # Capitalization variations for single Mondo term synonym: keep 2+ rows, but aggregate variations in 'diff' cols
+    # TODO decide what needs to be done here. is this good?
+    # todo: i think 3 lines below can be dropped
+    # counts = df.groupby(['synonym_lower']).size()
+    # df_unique = df[df['synonym_lower'].isin(counts[counts == 1].index)]
+    # df_duplicates = df[df['synonym_lower'].isin(counts[counts > 1].index)]
+    # - Mark which rows have multiple capitalizations
+    counts = df.groupby(['synonym_lower', 'mondo_id', 'source_id', 'synonym_scope']).size()
+    df_unique = df[
+        df.set_index(['synonym_lower', 'mondo_id', 'source_id', 'synonym_scope']).index.isin(counts[counts == 1].index)]
+    df_unique['synonym_case_mondo_is_many'] = False
+    df_duplicates = df[
+        df.set_index(['synonym_lower', 'mondo_id', 'source_id', 'synonym_scope']).index.isin(counts[counts > 1].index)]
+    df_duplicates['synonym_case_mondo_is_many'] = True
+    # df_duplicates = df_duplicates.sort_values(by=sort_cols)  # todo: needed?
+    # print('dupes + unique = tot?', len(df_duplicates) + len(df_unique) == len(df))  # todo temp
+    # a3 = df_duplicates[df_duplicates['source_id'] == 'DOID:6262']  # todo tmp
+    # x2 = df_duplicates[df_duplicates['synonym_case_source_is_many'] == True]  # todo temp: n=2 rows (1 case) of 550 rows
+    # df_duplicates = df_duplicates.sort_values(by=['synonym_case_source_is_many'], ascending=False)
+    # df_duplicates.head(10).to_csv('~/Desktop/dupes.csv', index=False)  # todo temp
+    # - Aggregate 'diff' col
+    #   First, create a mapping of the variations for each unique combination
+    variations = df_duplicates.groupby(['synonym_lower', 'mondo_id', 'source_id', 'synonym_scope'])[
+        'synonym_case_mondo'].agg(lambda x: '|'.join(sorted(set(x)))).reset_index()
+    #   Create a dictionary for easy lookup
+    variation_dict = variations.set_index(['synonym_lower', 'mondo_id', 'source_id', 'synonym_scope'])[
+        'synonym_case_mondo'].to_dict()
+    #   Apply this mapping to create the new column
+    df_duplicates['synonym_case_diff_mondo'] = df_duplicates.apply(lambda row:
+        variation_dict[(row['synonym_lower'], row['mondo_id'], row['source_id'], row['synonym_scope'])], axis=1)
+
+    df = pd.concat([df_unique, df_duplicates], ignore_index=True)
+
     return df