Skip to content

Commit

Permalink
fixed similarity matrix issue, added documentation of caveats, and up…
Browse files Browse the repository at this point in the history
…dated capitalization in plots (#143)
  • Loading branch information
braebigge authored Apr 12, 2024
1 parent 8d93ae9 commit 128945c
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 14 deletions.
2 changes: 1 addition & 1 deletion ProteinCartography/foldseek_clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ def pivot_foldseek_results(input_file: str, output_file: str, column_prefix=""):
with open(output_file, "w", newline="") as fh:
csv_writer = csv.writer(fh, delimiter="\t")

header = ["protid"] + [f"{column_prefix}{target}" for target in sorted(targets)]
header = ["protid"] + [f"{column_prefix}{target}" for target in targets]
csv_writer.writerow(header)

for entry in sorted(entries.items()):
Expand Down
2 changes: 1 addition & 1 deletion ProteinCartography/plot_cluster_distributions.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
"edgecolor": apc.All["arcadia:aegean"],
},
"Annotation": {
"textlabel": "Annotation Score",
"textlabel": "Annotation score",
"facecolor": apc.All["arcadia:wish"],
"edgecolor": apc.All["arcadia:aster"],
},
Expand Down
6 changes: 3 additions & 3 deletions ProteinCartography/plot_cluster_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def plot_group_similarity(
fig = px.imshow(sim_df, color_continuous_scale=arcadia_viridis, range_color=[0, 1])

colorbar_dict = dict(
title="similarity",
title="Similarity",
x=1,
y=0.5,
xanchor="left",
Expand All @@ -131,8 +131,8 @@ def plot_group_similarity(
fig.update_layout(width=plot_width, height=plot_height, coloraxis_colorbar=colorbar_dict)
fig.update_layout(margin=dict(l=0, r=0, t=0, b=0), paper_bgcolor="rgba(0,0,0,0)")

fig.update_xaxes(side="top", title="target")
fig.update_yaxes(title="query")
fig.update_xaxes(side="top", title="Target")
fig.update_yaxes(title="Query")

try:
fig.update_layout(font=dict(family="Arial"))
Expand Down
10 changes: 5 additions & 5 deletions ProteinCartography/plot_interactive.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ def generate_plotting_rules(
x
), # Leiden cluster is often read as int; this forces it to be string
"color_order": apc.Palettes["arcadia:AccentAllOrdered"].colors,
"textlabel": "Leiden Cluster",
"textlabel": "Leiden cluster",
},
"Annotation": {
"type": "categorical",
Expand All @@ -273,7 +273,7 @@ def generate_plotting_rules(
int(x)
), # Annotation score is parsed as float but we want it to be string
"color_dict": ANNOTATION_SCORE_COLOR_DICT,
"textlabel": "Annotation Score",
"textlabel": "Annotation score",
},
"Lineage": {
"type": "taxonomic",
Expand All @@ -283,7 +283,7 @@ def generate_plotting_rules(
), # This converts the taxonomic groupings from a string-ified list to a real list
"taxon_order": taxon_color_dict.keys(),
"color_order": taxon_color_dict.values(),
"textlabel": "Broad Taxon",
"textlabel": "Broad taxon",
"skip_hover": True,
},
"Length": {
Expand All @@ -302,7 +302,7 @@ def generate_plotting_rules(
"type": "categorical",
"fillna": "None",
"color_dict": PDB_ORIGIN_COLOR_DICT,
"textlabel": "PDB Origin",
"textlabel": "PDB origin",
},
"pdb_confidence": {
"type": "continuous",
Expand Down Expand Up @@ -943,7 +943,7 @@ def plot_interactive(
# [parameter to modify, value to modify it at]
args=[{"visible": True}, [len(fig.data) - 1]],
args2=[{"visible": False}, [len(fig.data) - 1]],
label="Input Proteins",
label="Input proteins",
)
],
type="buttons",
Expand Down
8 changes: 4 additions & 4 deletions ProteinCartography/semantic_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,8 @@ def ignore_function(x):
plt.gca().xaxis.set_major_locator(MaxNLocator(integer=True))
plt.gca().tick_params(axis="y", length=0)
plt.ylabel(f"{agg_col} {clu}", fontsize=14)
plt.xlabel("number of annotations")
plt.title(f"top {top_n} full annotations")
plt.xlabel("Number of annotations")
plt.title(f"Top {top_n} full annotations")

# plot the word cloud
plt.subplot(n_rows, n_cols * 2, i * 2 + 2)
Expand All @@ -201,7 +201,7 @@ def ignore_function(x):
# hide xticks and yticks for word cloud
plt.gca().set_xticks([])
plt.gca().set_yticks([])
plt.title("proportional word cloud")
plt.title("Proportional word cloud")

# tighten up the layout after plotting
plt.tight_layout()
Expand Down Expand Up @@ -409,7 +409,7 @@ def semantic_multiplot_plotly(
"showline": True,
"linewidth": 1,
"linecolor": apc.All["arcadia:crow"],
"title": "number of annotations",
"title": "Number of annotations",
"title_standoff": 2,
}

Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ Comparing protein structures across organisms can help us generate interesting b

Our pipeline starts with user-provided protein(s) of interest and searches the available sequence and structure databases for matches. Using the full list of matches, we can build a "map" of all the similar proteins and look for clusters of proteins with similar features. Overlaying a variety of different parameters such as taxonomy, sequence divergence, and other features onto these spaces allows us to explore the features that drive differences between clusters.

Because this tool is based on global structural comparisons, its results are not always useful for long proteins (>1200 amino acids), multi-domain proteins, or proteins with large unstructured regions. Additionally, while the results for average-length, well-structured proteins generally appear as expected, we have not yet comprehensively validated the clustering parameters, so users may find that different parameters work better for their specific analyses.

---
## Quickstart
1. Clone the GitHub repository.
Expand Down

0 comments on commit 128945c

Please sign in to comment.