fixed similarity matrix issue, added documentation of caveats, and up…

…dated capitalization in plots (#143)
Arcadia-Science · Apr 12, 2024 · 128945c · 128945c
1 parent 8d93ae9
commit 128945c
Show file tree

Hide file tree

Showing 6 changed files with 16 additions and 14 deletions.
diff --git a/ProteinCartography/foldseek_clustering.py b/ProteinCartography/foldseek_clustering.py
@@ -257,7 +257,7 @@ def pivot_foldseek_results(input_file: str, output_file: str, column_prefix=""):
     with open(output_file, "w", newline="") as fh:
         csv_writer = csv.writer(fh, delimiter="\t")
 
-        header = ["protid"] + [f"{column_prefix}{target}" for target in sorted(targets)]
+        header = ["protid"] + [f"{column_prefix}{target}" for target in targets]
         csv_writer.writerow(header)
 
         for entry in sorted(entries.items()):

diff --git a/ProteinCartography/plot_cluster_distributions.py b/ProteinCartography/plot_cluster_distributions.py
@@ -29,7 +29,7 @@
         "edgecolor": apc.All["arcadia:aegean"],
     },
     "Annotation": {
-        "textlabel": "Annotation Score",
+        "textlabel": "Annotation score",
         "facecolor": apc.All["arcadia:wish"],
         "edgecolor": apc.All["arcadia:aster"],
     },

diff --git a/ProteinCartography/plot_cluster_similarity.py b/ProteinCartography/plot_cluster_similarity.py
@@ -117,7 +117,7 @@ def plot_group_similarity(
     fig = px.imshow(sim_df, color_continuous_scale=arcadia_viridis, range_color=[0, 1])
 
     colorbar_dict = dict(
-        title="similarity",
+        title="Similarity",
         x=1,
         y=0.5,
         xanchor="left",
@@ -131,8 +131,8 @@ def plot_group_similarity(
     fig.update_layout(width=plot_width, height=plot_height, coloraxis_colorbar=colorbar_dict)
     fig.update_layout(margin=dict(l=0, r=0, t=0, b=0), paper_bgcolor="rgba(0,0,0,0)")
 
-    fig.update_xaxes(side="top", title="target")
-    fig.update_yaxes(title="query")
+    fig.update_xaxes(side="top", title="Target")
+    fig.update_yaxes(title="Query")
 
     try:
         fig.update_layout(font=dict(family="Arial"))

diff --git a/ProteinCartography/plot_interactive.py b/ProteinCartography/plot_interactive.py
@@ -264,7 +264,7 @@ def generate_plotting_rules(
                 x
             ),  # Leiden cluster is often read as int; this forces it to be string
             "color_order": apc.Palettes["arcadia:AccentAllOrdered"].colors,
-            "textlabel": "Leiden Cluster",
+            "textlabel": "Leiden cluster",
         },
         "Annotation": {
             "type": "categorical",
@@ -273,7 +273,7 @@ def generate_plotting_rules(
                 int(x)
             ),  # Annotation score is parsed as float but we want it to be string
             "color_dict": ANNOTATION_SCORE_COLOR_DICT,
-            "textlabel": "Annotation Score",
+            "textlabel": "Annotation score",
         },
         "Lineage": {
             "type": "taxonomic",
@@ -283,7 +283,7 @@ def generate_plotting_rules(
             ),  # This converts the taxonomic groupings from a string-ified list to a real list
             "taxon_order": taxon_color_dict.keys(),
             "color_order": taxon_color_dict.values(),
-            "textlabel": "Broad Taxon",
+            "textlabel": "Broad taxon",
             "skip_hover": True,
         },
         "Length": {
@@ -302,7 +302,7 @@ def generate_plotting_rules(
             "type": "categorical",
             "fillna": "None",
             "color_dict": PDB_ORIGIN_COLOR_DICT,
-            "textlabel": "PDB Origin",
+            "textlabel": "PDB origin",
         },
         "pdb_confidence": {
             "type": "continuous",
@@ -943,7 +943,7 @@ def plot_interactive(
                     # [parameter to modify, value to modify it at]
                     args=[{"visible": True}, [len(fig.data) - 1]],
                     args2=[{"visible": False}, [len(fig.data) - 1]],
-                    label="Input Proteins",
+                    label="Input proteins",
                 )
             ],
             type="buttons",

diff --git a/ProteinCartography/semantic_analysis.py b/ProteinCartography/semantic_analysis.py
@@ -191,8 +191,8 @@ def ignore_function(x):
         plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
         plt.gca().tick_params(axis="y", length=0)
         plt.ylabel(f"{agg_col} {clu}", fontsize=14)
-        plt.xlabel("number of annotations")
-        plt.title(f"top {top_n} full annotations")
+        plt.xlabel("Number of annotations")
+        plt.title(f"Top {top_n} full annotations")
 
         # plot the word cloud
         plt.subplot(n_rows, n_cols * 2, i * 2 + 2)
@@ -201,7 +201,7 @@ def ignore_function(x):
         # hide xticks and yticks for word cloud
         plt.gca().set_xticks([])
         plt.gca().set_yticks([])
-        plt.title("proportional word cloud")
+        plt.title("Proportional word cloud")
 
     # tighten up the layout after plotting
     plt.tight_layout()
@@ -409,7 +409,7 @@ def semantic_multiplot_plotly(
         "showline": True,
         "linewidth": 1,
         "linecolor": apc.All["arcadia:crow"],
-        "title": "number of annotations",
+        "title": "Number of annotations",
         "title_standoff": 2,
     }
 

diff --git a/README.md b/README.md
@@ -15,6 +15,8 @@ Comparing protein structures across organisms can help us generate interesting b
 
 Our pipeline starts with user-provided protein(s) of interest and searches the available sequence and structure databases for matches. Using the full list of matches, we can build a "map" of all the similar proteins and look for clusters of proteins with similar features. Overlaying a variety of different parameters such as taxonomy, sequence divergence, and other features onto these spaces allows us to explore the features that drive differences between clusters.
 
+Because this tool is based on global structural comparisons, the results are not always useful for long proteins (>1200 amino acids), multi-domain proteins, or proteins with large unstructured regions. Additionally, while we find that the results for average-length, well-structured proteins generally appear as expected, we have not yet comprehensively validated the clustering parameters, so users may find that different parameters work better for their specific analyses.
+
 ---
 ## Quickstart
 1. Clone the GitHub repository.