diff --git a/anvio/__init__.py b/anvio/__init__.py index 4e9e19503..8ec79fcab 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -181,6 +181,14 @@ def TABULATE(table, header, numalign="right", max_width=0): "You can provide these names as a comma-separated list of names, or you can put them in a file, " "where you have a single genome name in each line, and provide the file path."} ), + 'genomes-reverse': ( + ['-R', '--genome-reverse'], + {'metavar': "GENOME_NAMES", + 'required': False, + 'help': "Genome names to 'reverse'. You can use this parameter to choose the genomes whose synteny you want to reverse. " + "You can provide these names as a comma-separated list of names, or you can put them in a file, " + "where you have a single genome name in each line, and provide the file path."} + ), 'blank-profile': ( ['--blank-profile'], {'default': False, @@ -2115,6 +2123,74 @@ def TABULATE(table, header, numalign="right", max_width=0): 'type': str, 'help': "Directory path for output files"} ), + # start of anvi-pan-graph related input and output variables: + 'testing-yaml': ( + ['--testing-yaml'], + {'metavar': 'YAML', + 'type': str, + 'help': "A yaml file containing raw gene cluster order for testing the anvi'o pan graph."} + ), + 'gene-additional-data': ( + ['--gene-additional-data'], + {'metavar': 'CSV', + 'type': str, + 'help': "A CSV file containing at least one column with values per genome, contig and genecall column."} + ), + 'gc-additional-data': ( + ['--gc-additional-data'], + {'metavar': 'CSV', + 'type': str, + 'help': "A CSV file containing at least one column with values per genome, contig and gc column."} + ), + 'output-pan-graph-json': ( + ['-o', '--output-pan-graph-json'], + {'metavar': 'FILE_PATH', + 'type': str, + 'help': "File path to store the temporary pan-graph-json file."} + ), + 'output-graphml': ( + ['--output-graphml'], + {'metavar': 'FILE_PATH', + 'type': str, + 'help': "File path to store the final graph file."} + ), + 
'output-raw-gc-additional-data': ( + ['--output-raw-gc-additional-data'], + {'metavar': 'FILE_PATH', + 'type': str, + 'help': "File path to create and store a raw gc additional data table for testing or join purposes."} + ), + 'output-raw-gene-additional-data': ( + ['--output-raw-gene-additional-data'], + {'metavar': 'FILE_PATH', + 'type': str, + 'help': "File path to create and store a raw gc additional data table for testing or join purposes."} + ), + 'output-testing-yaml': ( + ['--output-testing-yaml'], + {'metavar': 'FILE_PATH', + 'type': str, + 'help': "File path to create and store a testing yaml for a existing pangenome."} + ), + 'output-dir-summary': ( + ['--output-dir-summary'], + {'metavar': 'DIR_PATH', + 'type': str, + 'help': "File path to store a CSV file containing values describing the pan-graph for downstream analysis."} + ), + 'output-dir-graphics': ( + ['--output-dir-graphics'], + {'metavar': 'DIR_PATH', + 'type': str, + 'help': "File path to store the hierarchical clusterings for the paralog splitting."} + ), + 'pan-graph-json': ( + ['-i', '--pan-graph-json'], + {'metavar': 'JSON', + 'type': str, + 'help': "A JSON formatted anvi'o pan graph, which is typically generated by `anvi-pan-graph`."} + ), + # end of pan-graph related variables 'output-file': ( ['-o', '--output-file'], {'metavar': 'FILE_PATH', @@ -3701,6 +3777,7 @@ def set_version(): anvio_codename, \ t.contigs_db_version, \ t.pan_db_version, \ + t.pangraph_json_version, \ t.profile_db_version, \ t.genes_db_version, \ t.auxiliary_data_version, \ @@ -3717,6 +3794,7 @@ def get_version_tuples(): ("Genes DB version", __genes__version__), ("Auxiliary data storage version", __auxiliary_data_version__), ("Pan DB version", __pan__version__), + ("Pangraph JSON version", __pangraph__version__), ("Genome data storage version", __genomes_storage_version__), ("Structure DB version", __structure__version__), ("KEGG Modules DB version", __kegg_modules_version__), @@ -3729,6 +3807,7 @@ def 
print_version(): run.info("Profile database", __profile__version__) run.info("Contigs database", __contigs__version__) run.info("Pan database", __pan__version__) + run.info("Pangraph JSON", __pangraph__version__) run.info("Genome data storage", __genomes_storage_version__) run.info("Auxiliary data storage", __auxiliary_data_version__) run.info("Structure database", __structure__version__) @@ -3740,6 +3819,7 @@ def print_version(): __codename__, \ __contigs__version__, \ __pan__version__, \ +__pangraph__version__, \ __profile__version__, \ __genes__version__, \ __auxiliary_data_version__, \ diff --git a/anvio/bottleroutes.py b/anvio/bottleroutes.py index ef6e8dbcb..30bb53bd4 100644 --- a/anvio/bottleroutes.py +++ b/anvio/bottleroutes.py @@ -21,6 +21,7 @@ import argparse import datetime import importlib +import networkx as nx from hashlib import md5 from ete3 import Tree @@ -38,6 +39,7 @@ import anvio import anvio.dbops as dbops +import anvio.panops as panops import anvio.utils as utils import anvio.drivers as drivers import anvio.terminal as terminal @@ -197,7 +199,12 @@ def register_routes(self): self.route('/data/get_gene_info/', callback=self.get_gene_info) self.route('/data/get_metabolism', callback=self.get_metabolism) self.route('/data/get_scale_bar', callback=self.get_scale_bar, method='POST') - + self.route('/pangraph/settings', callback=self.get_pangraph_settings, method="POST") + self.route('/pangraph/get_json', callback=self.get_pangraph_json_data, method="POST") + self.route('/pangraph/alignment', callback=self.get_pangraph_gc_alignment, method="POST") + self.route('/pangraph/analyse', callback=self.get_pangraph_bin_summary, method="POST") + self.route('/pangraph/function', callback=self.get_pangraph_gc_function, method="POST") + self.route('/pangraph/filter', callback=self.get_pangraph_passed_gene_clusters, method="POST") def run_application(self, ip, port): # check for the wsgi module bottle will use. 
@@ -266,6 +273,8 @@ def redirect_to_app(self): homepage = 'structure.html' elif self.interactive.mode == 'metabolism': homepage = 'metabolism.html' + elif self.interactive.mode == 'pangraph': + homepage = 'pangraph.html' elif self.interactive.mode == 'inspect': redirect('/app/charts.html?id=%s&show_snvs=true&rand=%s' % (self.interactive.inspect_split_name, self.random_hash(8))) @@ -1069,7 +1078,7 @@ def gen_summary(self, collection_name): path = "summary/%s/index.html" % (collection_name) return json.dumps({'path': path}) - + def send_summary_static(self, collection_name, filename): if self.interactive.mode == 'pan': @@ -1534,7 +1543,7 @@ def get_functions_for_gene_clusters(self): message = (f"At least one of the gene clusters in your list (e.g., {gene_cluster_name}) is missing in " f"the functions summary dict :/") return json.dumps({'status': 1, 'message': message}) - + d[gene_cluster_name] = self.interactive.gene_clusters_functions_summary_dict[gene_cluster_name] return json.dumps({'functions': d, 'sources': list(self.interactive.gene_clusters_function_sources)}) @@ -1551,4 +1560,133 @@ def get_scale_bar(self): message = str(e.clear_text()) if hasattr(e, 'clear_text') else str(e) return json.dumps({'status': 1, 'message': message}) - return json.dumps({'scale_bar_value': total_branch_length}) \ No newline at end of file + return json.dumps({'scale_bar_value': total_branch_length}) + + def get_pangraph_json_data(self): + + return self.interactive.get_pangraph_json() + + + def get_pangraph_bin_summary(self): + + payload = request.json + selection = payload['selection'] + result = {} + + self.interactive.init_gene_clusters() + self.interactive.init_gene_clusters_functions_summary_dict() + + bin_functions = {} + all_functions = [] + + for bin in list(payload.keys())[1:]: + + bin_functions[bin] = [] + gene_cluster_list = payload[bin] + for gene_cluster_name in gene_cluster_list: + function = 
self.interactive.gene_clusters_functions_summary_dict[gene_cluster_name][selection]['function'] + + bin_functions[bin].append(function) + all_functions.append(function) + + for function in all_functions: + result[function] = {} + for bin in bin_functions.keys(): + if function in bin_functions[bin]: + result[function][bin] = 1 + else: + result[function][bin] = 0 + + return json.dumps(result) + + def get_pangraph_settings(self): + + payload = request.json + max_edge_length_filter = payload['maxlength'] + gene_cluster_grouping_threshold = payload['condtr'] + groupcompress = payload['groupcompress'] + ungrouping_area = payload['ungroup'] + print(ungrouping_area) + + args = argparse.Namespace( + pan_graph_json=self.interactive.pan_graph_json_path, + output_pan_graph_json=self.interactive.pan_graph_json_path, + max_edge_length_filter=max_edge_length_filter, + gene_cluster_grouping_threshold=gene_cluster_grouping_threshold, + gene_cluster_grouping_compression=groupcompress, + ungrouping_area = ungrouping_area + ) + + Pangraph = panops.Pangraph(args) + + Pangraph.load_graph_from_json_file() + Pangraph.run_synteny_layout_algorithm() + Pangraph.update_json_dict() + Pangraph.store_network() + + return({'status': 0}) + + def get_pangraph_gc_alignment(self): + + payload = request.json + result = {} + + for genome in payload.keys(): + genecall, contig, direction, name = payload[genome] + gene_cluster_alignment_dict = self.interactive.get_sequences_for_gene_clusters(gene_cluster_names=set([name]), skip_alignments=False, report_DNA_sequences=False)[name] + sequence = gene_cluster_alignment_dict[genome][int(genecall)] + + result[genome] = [genecall, contig, direction, sequence] + + return(result) + + def get_pangraph_gc_function(self): + + payload = request.json + result = {} + + self.interactive.init_gene_clusters() + self.interactive.init_gene_clusters_functions_summary_dict() + + gene_cluster_name = payload['genecluster'] + + for selection in 
def get_newick(node, parent_dist, leaf_names, newick=''):
    """Recursively render a scipy `ClusterNode` tree as a newick string.

    Modified from the solution at
    https://stackoverflow.com/questions/28222179/save-dendrogram-to-newick-format

    `node` is a scipy.cluster.hierarchy.ClusterNode, `parent_dist` the height
    of its parent (pass `root.dist` for the root), `leaf_names` maps leaf ids
    to labels, and `newick` is the already-rendered suffix accumulated while
    descending. Branch lengths are formatted with two decimals.
    """
    branch_length = parent_dist - node.dist

    if node.is_leaf():
        return "%s:%.2f%s" % (leaf_names[node.id], branch_length, newick)

    # the root call starts with an empty suffix, which becomes the closing ';'
    suffix = "):%.2f%s" % (branch_length, newick) if newick else ");"

    left_part = get_newick(node.get_left(), node.dist, leaf_names, newick=suffix)
    full_part = get_newick(node.get_right(), node.dist, leaf_names, newick="," + left_part)

    return "(" + full_part
translateX(-50%); + } + + .right { + overflow-y: scroll; + } + + #node_basics_table, #node_functions_table, #node_sequence_alignments_table { + font-size: x-small; + } + + #node_basics_table { + text-align: center; + } + + .modal_header { + font-variant: small-caps; + font-size: x-large; + background: linear-gradient(88deg, #f1f8ff, #ffffff); + padding-left: 10px; + margin-top: 20px; + margin-bottom: 5px; + } + + .gene_cluster_context_items { + display: block; + border: 1px #dee2e6 solid; + padding: 0px 3px; + font-size: x-small; + background: #f2f2f2; + } + + .gene_cluster_id { + display: inline-block; + margin: 5px 2px; + padding: 3px 5px; + border: 1px solid #9c9c9c; + border-radius: 6px; + font-family: monospace; + background: #fff0f1; + } + + .gene_cluster_id_current { + background: #d8ebde; + } + + .bin-icon-size{ + font-size: 14px; + } + + #panel-left { + z-index: 2; + } + + #toggle-panel-left{ + z-index: 1; + } + + #toggle-panel-top{ + z-index: 1; + } + + #title-panel{ + z-index: 1; + } + + #title-alter-panel{ + z-index: 1; + } + + .user-handle:hover{ + background: var(--secondary-100); + } + + .tableFixHead { overflow: auto; height: 100px; } + .tableFixHead thead th { position: sticky; top: 0; z-index: 1; } + + /* Just common table stuff. Really. 
*/ + /* table { border-collapse: collapse; width: 100%; } + th, td { padding: 8px 16px; } + th { background:#eee; } */ + + #InfoModalBody > .card-body { + padding-top: 0; + } + + #gc-alignment-font span{ + font-family: monospace !important; + } + + #gc-alignment-font{ + background-color: #e4e4e4; + } + + #td-genome-cell{ + position: sticky; + left: 0; + background-color: #e4e4e4; + z-index: 2; + width: min-content; + } + + #td-value-cell{ + position: sticky; + background-color: #e4e4e4; + z-index: 1; + display: flex; + align-items: center; + justify-content: center; + } + +#td-contig-cell{ + position: sticky; + background-color: #e4e4e4; + z-index: 1; + } + +#td-direction-cell{ + position: sticky; + background-color: #e4e4e4; + z-index: 1; + display: flex; + align-items: center; + justify-content: center; + } + +.td-analysis-cell-0{ + background-color: red; +} + +.td-analysis-cell-1{ + background-color: green; +} + + +.scroll-wrapper { + overflow-x: auto; + max-height: 200px; +} + +.scrollable-content { + overflow-x: auto; + white-space: nowrap; + background-color: #e4e4e4; +} + +#th-sequence{ + position: sticky; + left: 25%; + z-index: 7; +} + +.gc-table-header{ + position: sticky; + top: 0; + z-index: 6; +} \ No newline at end of file diff --git a/anvio/data/interactive/js/pangraph.js b/anvio/data/interactive/js/pangraph.js new file mode 100644 index 000000000..16c7bdf6c --- /dev/null +++ b/anvio/data/interactive/js/pangraph.js @@ -0,0 +1,2804 @@ +//ANCHOR - Constants +const mapAS = { + 'A': 'A', 'R': 'R', + 'N': 'N', 'D': 'D', + 'C': 'C', 'Q': 'Q', + 'E': 'E', 'G': 'G', + 'H': 'H', 'I': 'I', + 'L': 'L', 'K': 'K', + 'M': 'M', 'F': 'F', + 'P': 'P', 'S': 'S', + 'T': 'T', 'W': 'W', + 'Y': 'Y', 'V': 'V', + '-': '-' +}; + +// we will fill this later from the incoming data +var functional_annotation_sources_available = []; +//ANCHOR - Bin creation +var bins = {"bin1": []}; +var binnum = 1; + +var current_groups = {} + +// NOTE - From 
// NOTE - Adapted from https://stackoverflow.com/questions/1053843/get-the-element-with-the-highest-occurrence-in-an-array
// Returns [mode, percentage-of-array] for the most frequent element, or null
// for an empty array. Ties above count 1 are joined with ' & '; while all
// counts are still 1, the most recently seen element wins (original quirk,
// preserved on purpose).
function modeString(array) {
    if (array.length == 0) return null;

    var counts = {},
        winner = array[0],
        winnerCount = 1;

    for (var idx = 0; idx < array.length; idx++) {
        var entry = array[idx];

        counts[entry] = (counts[entry] == null) ? 1 : counts[entry] + 1;

        if (counts[entry] > winnerCount) {
            winner = entry;
            winnerCount = counts[entry];
        } else if (counts[entry] == winnerCount && winnerCount == 1) {
            winner = entry;
            winnerCount = counts[entry];
        } else if (counts[entry] == winnerCount) {
            winner += ' & ' + entry;
            winnerCount = counts[entry];
        }
    }

    // after the loop, idx == array.length (var is function-scoped)
    return [winner, Math.round(winnerCount / idx * 100)];
}
`; + } + + basic_layer_table += ``; + basic_layer_table += ``; + for (const [key, value] of Object.entries(basic_info)) { + basic_layer_table += ``; + } + basic_layer_table += ``; + + basic_layer_table += ``; + for (const [key, value] of Object.entries(basic_info)) { + basic_layer_table += ``; + } + basic_layer_table += `
` + key + `
` + value + `
`; + + return basic_layer_table; + +} + +function get_gene_cluster_basics_table(gene_cluster_id, gene_cluster_context, data, add_align) { + // first, learn a few basics about the gene cluster to be displayed + var x_pos = data['elements']['nodes'][gene_cluster_id]['position']['x_offset'] + // var position_in_graph = x_pos + " / " + (data["infos"]["meta"]["global_x_offset"] - 1); + var position_in_graph = x_pos; + // var num_contributing_genomes = Object.keys(data['elements']['nodes'][gene_cluster_id]['genome']).length + " / " + (data['infos']['num_genomes']); + var num_contributing_genomes = Object.keys(data['elements']['nodes'][gene_cluster_id]['genome']).length; + var gene_cluster_name = data['elements']['nodes'][gene_cluster_id]['name'] + + if (gene_cluster_context == null){ + var context = gene_cluster_id + } else { + var context = gene_cluster_context + } + + basic_info = {'ID': gene_cluster_id, 'Gene Cluster': gene_cluster_name, 'Contributing Genomes': num_contributing_genomes, 'Position in Graph': position_in_graph} + // build the basic information table + if (add_align == 1) { + basic_info_table = ``; + basic_info_table += ``; + } else { + basic_info_table = '' + basic_info_table += `
`; + } + + basic_info_table += ``; + basic_info_table += ``; + for (const [key, value] of Object.entries(basic_info)) { + basic_info_table += ``; + } + basic_info_table += ``; + + basic_info_table += ``; + for (const [key, value] of Object.entries(basic_info)) { + basic_info_table += ``; + } + basic_info_table += `
` + key + `
` + value + `
`; + + return basic_info_table; +} + + +async function get_gene_cluter_functions_table(gene_cluster_id, data, add_alig) { + + if (add_align == 1) { + functions_table = ``; + functions_table += ``; + } else { + functions_table = '' + functions_table += `
`; + } + functions_table += ``; + functions_table += ``; + functions_table += ``; + functions_table += ``; + functions_table += `\n\n`; + + var gene_cluster_name = data['elements']['nodes'][gene_cluster_id]['name'] + var d = await get_gene_cluster_consensus_functions(gene_cluster_name); + + // console.log(d) + + function_sources = Object.keys(d).sort(); + // console.log(function_sources); + for (source of function_sources) { + value = d[source]; + // console.log(source) + var accession = value['accession'] + accession === undefined | accession === null ? accession = '' : accession = accession + var func = value['function'] + func === undefined | func === null ? func = '' : func = func + + functions_table += ``; + functions_table += ``; + functions_table += ``; + functions_table += ``; + functions_table += ``; + } + functions_table += `
SourceAccessionFunction
` + source + `` + accession + `` + func + `
\n\n`; + + return functions_table +} + + +function get_gene_cluster_context_table(gene_cluster_id_current, gene_cluster_context, data, add_align) { + + if (gene_cluster_context == null){ + return ''; + } else { + var group_context = [] + for (item in data['infos']['groups'][gene_cluster_context]){ + group_context.push(data['infos']['groups'][gene_cluster_context][item]) + } + } + + // console.log(gene_cluster_id_current, gene_cluster_context, group_context) + if (add_align == 1) { + gene_cluster_context_table = ``; + }else { + gene_cluster_context_table = '' + } + gene_cluster_context_table += `
`; + for(index in group_context) { + gene_cluster_id = group_context[index]; + gene_cluster_name = data['elements']['nodes'][gene_cluster_id]['name'] + + if (gene_cluster_id == gene_cluster_id_current){ + gene_cluster_context_table += `` + gene_cluster_name + ``; + } else { + gene_cluster_context_table += `` + gene_cluster_name + ``; + } + } + gene_cluster_context_table += `
`; + + return gene_cluster_context_table; + +} + + +//ANCHOR - Get tables for GC basic info and functions +async function get_gene_cluster_display_tables(gene_cluster_id, gene_cluster_context, data, add_align) { + // The purpose of this function is to build HTML formatted tables to give access to + // the details of a gene cluster. The parameters here are, + // + // - `gene_cluster_id`: A singular gene cluster id to be detailed, + // - `gene_cluster_context`: A list of gene cluster ids that occur in the same context + // with `gene_cluster_id` (either they were binned together, or they were in the same + // group of gene clusters). + // - `data`: the primary data object from the JSON input + // + // If everything goes alright, this function will return a table that can be displayed in + // any modal window. + /////////////////////////////////////////////////////////////////////////////////////////// + // BUILD CONTEXT + // if this is a gene cluster that is a part of a context, then all others are going to be + // shown here + /////////////////////////////////////////////////////////////////////////////////////////// + + gene_cluster_context_table = get_gene_cluster_context_table(gene_cluster_id, gene_cluster_context, data, add_align); + + /////////////////////////////////////////////////////////////////////////////////////////// + // BUILD BASIC INFORMATION TABLE + /////////////////////////////////////////////////////////////////////////////////////////// + + basic_info_table = get_gene_cluster_basics_table(gene_cluster_id, gene_cluster_context, data, add_align); + + /////////////////////////////////////////////////////////////////////////////////////////// + // BUILD FUNCTIONS TABLE + /////////////////////////////////////////////////////////////////////////////////////////// + + basic_layer_table = get_layer_data(gene_cluster_id, data, add_align); + + functions_table = await get_gene_cluter_functions_table(gene_cluster_id, data, add_align); + + 
/////////////////////////////////////////////////////////////////////////////////////////// + // RETRIEVE AND BUILD SEQUENCE ALIGNMENTS + /////////////////////////////////////////////////////////////////////////////////////////// + + if (add_align == 1) { + + var alignment = {} + + // if (gene_cluster_id != 'start' && gene_cluster_id != 'stop') { + for (var genome of Object.keys(data['elements']['nodes'][gene_cluster_id]['genome'])) { + + // if ($('#flex' + genome).prop('checked') == true){ + + var genecall = data['elements']['nodes'][gene_cluster_id]['genome'][genome]['gene_call'] + var contig = data['elements']['nodes'][gene_cluster_id]['genome'][genome]['contig'] + var direction = data['elements']['nodes'][gene_cluster_id]['genome'][genome]['direction'] + var name = data['elements']['nodes'][gene_cluster_id]['name'] + alignment[genome] = [genecall, contig, direction, name] + + // } + } + // } + + gene_cluster_sequence_alignments_table = await appendalignment(gene_cluster_id, alignment) + + /////////////////////////////////////////////////////////////////////////////////////////// + // MERGE ALL AND RETURN + /////////////////////////////////////////////////////////////////////////////////////////// + + return gene_cluster_context_table + basic_info_table + basic_layer_table + functions_table + gene_cluster_sequence_alignments_table + + } else { + + return gene_cluster_context_table + basic_info_table + basic_layer_table + functions_table + + } +} + +//ANCHOR - Fetch GC alignment +function fetchalignment(alignment) { + + var d = $.ajax({ + url: "/pangraph/alignment", + type: "POST", + data: JSON.stringify(alignment), + contentType: "application/json", + dataType: "json" + }) + + return d + +}; + +//ANCHOR - Append GC alignment +async function appendalignment(gene_cluster_id, alignment) { + // FIXME: We need to come up with more accurate and explicit function names. 
+ if (Object.keys(alignment).length == 0) { + return '' + } + + alignments_table = ``; + alignments_table += `
`; + alignments_table += ``; + alignments_table += ``; + alignments_table += ``; + alignments_table += ``; + alignments_table += ``; + alignments_table += ``; + alignments_table += `\n\n`; + + + var d = await fetchalignment(alignment) + + // console.log(d) + + for (var [genome, value] of Object.entries(d)) { + var colored = value[3].replace(/A|R|N|D|C|Q|E|G|H|I|L|K|M|F|P|S|T|W|Y|V|-/gi, function(matched){return mapAS[matched];}); + + alignments_table += `` + alignments_table += `` + alignments_table += `` + alignments_table += `` + alignments_table += `` + alignments_table += `` + alignments_table += `` + } + + alignments_table += `
GenomeGene CallContigDirectionSequence
` + genome + `` + value[0] + `` + value[1] + `` + value[2] + `
` + colored + `
\n\n` + + // I guess this shouldn't return anything but insert the results into a context? + // lol sorry if I screwed these thigns up badly :) IDK what I'm doing :p + return alignments_table; +} + +//ANCHOR - Color node and add/remove from bin +function marknode(e, data, binid, bins){ + + var bincolor = document.getElementById(binid + 'color').value + var id = e.id; + var current = '' + + var binkeys = Object.keys(bins) + for (var key of binkeys) { + if (bins[key].includes(id)) { + current = key + break; + } + } + + if ($('#flexsaturation').prop('checked') == true){ + var saturation = 1 + } else { + var saturation = 0 + } + + if (e.getAttribute('class') == 'group') { + + var node_color = $('#groups')[0].value; + var genome_size = Object.keys(data["infos"]["genomes"]).length; + + var group = data['infos']['groups'][id] + var node_name = group[0] + + var node = data['elements']['nodes'][node_name]; + var genome = Object.keys(node['genome']).length; + + } else if (e.getAttribute('class') == 'node') { + + var node_color = $('#nodes')[0].value; + var genome_size = Object.keys(data["infos"]["genomes"]).length; + + var node = data['elements']['nodes'][id]; + var genome = Object.keys(node['genome']).length; + + } + + if (current === binid) { + + if (saturation == 1){ + e.setAttribute("fill", lighter_color('#ffffff', node_color, genome / genome_size)) + } else { + e.setAttribute("fill", node_color) + } + + bins[binid] = bins[binid].filter(item => item !== id) + $('#' + binid + 'value')[0].value = bins[binid].length + + } else if (current === '') { + + if (saturation == 1){ + e.setAttribute("fill", lighter_color('#ffffff', bincolor, genome / genome_size)) + } else { + e.setAttribute("fill", bincolor) + } + + bins[binid].push(id) + $('#' + binid + 'value')[0].value = bins[binid].length + + } else { + + if (saturation == 1){ + e.setAttribute("fill", lighter_color('#ffffff', bincolor, genome / genome_size)) + } else { + e.setAttribute("fill", bincolor) + } + + bins[current] = 
bins[current].filter(item => item !== id) + bins[binid].push(id) + $('#' + binid + 'value')[0].value = bins[binid].length + $('#' + current + 'value')[0].value = bins[current].length + + } + + return bins +} + +// //ANCHOR - Information for the GC +async function nodeinfo(e, data) { + var id = e.id; + var element = document.getElementById(id); + + if (element.getAttribute('class') == 'group') { + gene_cluster_context = id; + gene_cluster_id = data['infos']['groups'][id][0] + } else { + gene_cluster_id = id; + gene_cluster_context = null; + } + + $('#InfoModalBody').empty() + var bodyinfo = $('
') + // var closeBtn = $('
// NOTE(review): this chunk is the added-lines payload of a diff whose SVG/HTML string
// literals were stripped by the capture ($('') / output += '' below). The tail of a
// modal-populating function preceded this point; its head lies outside the visible
// region and is therefore not reproduced here. Restore both from the upstream source.

//ANCHOR - Degree to rad calculation
/**
 * Convert an angle from degrees to radians.
 */
function deg2rad(degrees) {
    return degrees * Math.PI / 180;
}

//ANCHOR - General download function
/**
 * Trigger a browser download of `blob` under the file name `name` by
 * attaching a temporary <a> element and dispatching a synthetic click.
 */
function downloadBlob(blob, name) {
    var blobUrl = URL.createObjectURL(blob);
    var link = document.createElement("a");

    link.href = blobUrl;
    link.download = name;

    document.body.appendChild(link);

    link.dispatchEvent(
        new MouseEvent('click', {
            bubbles: true,
            cancelable: true,
            view: window
        })
    );

    document.body.removeChild(link);
}

// NOTE - From https://coderwall.com/p/z8uxzw/javascript-color-blender
/**
 * Convert a 0-255 channel value to a two-digit lowercase hex string.
 */
function int_to_hex(num) {
    var hex = Math.round(num).toString(16);
    if (hex.length == 1)
        hex = '0' + hex;
    return hex;
}

//ANCHOR - Function to mix two colors
/**
 * Linearly blend two '#rrggbb' colors. `percentage` (0..1) is first rescaled
 * into [threshold, 1] so that even a 0% blend keeps a visible share of color2.
 * Returns the blended color as '#rrggbb'.
 */
function lighter_color(color1, color2, percentage, threshold = 0.25) {

    percentage = threshold + (1 - threshold) * percentage;

    var c1 = color1.substring(1);
    var c2 = color2.substring(1);

    var rgb1 = [parseInt(c1.substring(0, 2), 16), parseInt(c1.substring(2, 4), 16), parseInt(c1.substring(4, 6), 16)];
    var rgb2 = [parseInt(c2.substring(0, 2), 16), parseInt(c2.substring(2, 4), 16), parseInt(c2.substring(4, 6), 16)];

    var mixed = [
        (1 - percentage) * rgb1[0] + percentage * rgb2[0],
        (1 - percentage) * rgb1[1] + percentage * rgb2[1],
        (1 - percentage) * rgb1[2] + percentage * rgb2[2]
    ];

    return '#' + int_to_hex(mixed[0]) + int_to_hex(mixed[1]) + int_to_hex(mixed[2]);
}

/**
 * Build a jQuery-wrapped SVG path covering the rectangle (i_x,i_y)-(j_x,j_y),
 * either on the circular layout (coordinates run through transform(theta)) or
 * on the linear layout (x scaled by node_distance_x, y negated).
 * NOTE(review): the SVG path markup was stripped from this capture ($('')).
 */
function create_rectangle(i_x, i_y, j_x, j_y, theta, node_distance_x, linear, color, id = '') {

    if (id != '') {
        var extra = '" id="' + id;
    } else {
        var extra = '';
    }

    if (linear == 0) {
        var [a_x, a_y] = transform(i_x, i_y, theta);
        var [b_x, b_y] = transform(j_x, i_y, theta);
        var [c_x, c_y] = transform(i_x, j_y, theta);
        var [d_x, d_y] = transform(j_x, j_y, theta);

        var path = $('');
    } else {
        var [a_x, a_y] = [i_x * node_distance_x, -i_y];
        var [b_x, b_y] = [j_x * node_distance_x, -i_y];
        var [c_x, c_y] = [i_x * node_distance_x, -j_y];
        var [d_x, d_y] = [j_x * node_distance_x, -j_y];

        var path = $('');
    }

    return (path);
}

//ANCHOR - Transformation function
/**
 * Map a (column, radius) pair onto circular-layout coordinates, where each
 * column step advances `theta` degrees around the circle.
 */
function transform(x, y, theta) {
    var circle_x = y * Math.sin(deg2rad(theta * x));
    var circle_y = y * Math.cos(deg2rad(theta * x));
    return [circle_x, circle_y];
}

/**
 * Return the color of the genome that appears earliest in the UI ordering.
 * `edgecoloring` maps genome name -> [ui_index, '#rrggbb'].
 */
function pickcolor(edgecoloring, genomes) {

    var entries = [];
    for (var name of genomes) {
        entries.push(edgecoloring[name]);
    }

    // Sort ONCE, after collecting (the original re-sorted on every iteration).
    entries.sort(function (a, b) {
        return a[0] - b[0];
    });

    return entries[0][1];
}

/**
 * Emit SVG markup for a dendrogram described by `order` (triples of
 * [item, start_dist, end_dist], leaves interleaved with 'branching' rows).
 * `item_dist` maps leaf name -> its y position; distances are scaled into
 * `max_size` pixels and shifted left by `offset`.
 * NOTE(review): the SVG line templates were stripped from this capture
 * (output += '' below); restore them from the upstream source.
 */
function draw_newick(order, item_dist, max_dist, offset, max_size, line_thickness) {

    var output = '';
    var saving_positions = {};
    var saving_ids = {};
    // BUGFIX: was an implicit global in the original; a stale cross-call value
    // could otherwise leak into the first 'branching' row. TODO confirm the
    // 'branching'-before-any-leaf case cannot occur upstream.
    var sorted_positions = [];

    var i = 0;

    for (var [item, start, end] of order) {

        var start_fraction = (max_size * (start / max_dist) - max_size) - offset;
        var end_fraction = (max_size * (end / max_dist) - max_size) - offset;

        if (item != 'branching') {
            var y_value = item_dist[item];
            output += '';

            if (Object.values(saving_ids).includes(start_fraction)) {
                var saving_id = Object.keys(saving_ids)[Object.values(saving_ids).indexOf(start_fraction)];
                saving_positions[saving_id].push(y_value);
            } else {
                saving_ids[i] = start_fraction;
                saving_positions[i] = [y_value];
                i = i + 1;
            }
            if (end_fraction != max_size) {
                output += '';
            }
        } else {

            if (Object.values(saving_ids).includes(end_fraction)) {
                var saving_id_main = Object.keys(saving_ids)[Object.values(saving_ids).indexOf(end_fraction)];
                // BUGFIX: default sort() compares numbers lexicographically
                // (e.g. 10 < 9); a numeric comparator pairs adjacent children
                // correctly when drawing the vertical connectors.
                sorted_positions = saving_positions[saving_id_main].sort(function (a, b) {
                    return a - b;
                });

                for (var j = 0; j < sorted_positions.length - 1; j++) {
                    var y_value_i = sorted_positions[j];
                    var y_value_j = sorted_positions[j + 1];
                    output += '';
                }
            } else {
                var saving_id_main = -1;
            }

            // Branch point sits midway between the outermost child positions.
            y_value = Math.min(...sorted_positions) + (Math.max(...sorted_positions) - Math.min(...sorted_positions)) / 2;
            output += '';

            if (Object.values(saving_ids).includes(start_fraction)) {
                var saving_id = Object.keys(saving_ids)[Object.values(saving_ids).indexOf(start_fraction)];
                saving_positions[saving_id].push(y_value);
            } else {
                saving_ids[i] = start_fraction;
                saving_positions[i] = [y_value];
                i = i + 1;
            }

            if (saving_id_main != -1) {
                delete saving_ids[saving_id_main];
                delete saving_positions[saving_id_main];
            }
        }
    }
    return (output);
}

/**
 * Recursively flatten a Newick string into [name, start_dist, end_dist]
 * triples; internal nodes are emitted under the name 'branching'.
 * `prior` is the cumulative distance of the enclosing clade.
 */
function newick_to_order(string, prior = 0) {
    var result = [];
    // BUGFIX: replace(' ', '') removed only the FIRST space; strip them all.
    var newick = string.replace(/ /g, '');

    if (newick[0] == '(') {
        var bracket_open = 0;
        var bracket_closed = 0;

        for (var i = 0; i < newick.length; i++) {

            var sub_letter = newick[i];
            if (sub_letter == '(') {
                bracket_open += 1;
            } else if (sub_letter == ')') {
                bracket_closed += 1;
            }

            if (bracket_open == bracket_closed) {

                var sub_newick = newick.slice(1, i);
                var rest = newick.slice(i);

                if (rest.includes(',')) {
                    var parts = rest.split(',');
                    if (parts[0].includes(':')) {
                        var value = parts[0].split(':')[1];
                    } else {
                        var value = 0;
                    }

                    result.push(['branching', prior, prior + parseFloat(value)]);
                    result.push(...newick_to_order(sub_newick, prior + parseFloat(value)));

                    var next_iter = parts.slice(1).join(',');
                    result.push(...newick_to_order(next_iter, prior));
                } else {
                    if (rest.includes(':')) {
                        var value = rest.split(':').slice(1);
                    } else {
                        var value = 0;
                    }
                    result.push(['branching', prior, prior + parseFloat(value)]);
                    result.push(...newick_to_order(sub_newick, prior + parseFloat(value)));
                }
                break;
            }
        }
    } else {

        if (newick.includes(',')) {
            var parts = newick.split(',');
            if (parts[0].includes(':')) {
                var value = parts[0].split(':')[1];
                var branch = parts[0].split(':')[0];
            } else {
                var value = 0;
                var branch = parts[0];
            }

            var next_iter = parts.slice(1).join(',');
            result.push([branch, prior, prior + parseFloat(value)]);
            result.push(...newick_to_order(next_iter, prior));

        } else {
            if (newick.includes(':')) {
                var value = newick.split(':')[1];
                var branch = newick.split(':')[0];
            } else {
                var value = 0;
                var branch = newick;
            }
            result.push([branch, prior, prior + parseFloat(value)]);
        }
    }

    return (result);
}
(rest.includes(':')) { + var value = rest.split(':').slice(1) + } else { + var value = 0 + } + result.push(...[['branching', prior, prior + parseFloat(value)]]) + // console.log(sub_newick) + result.push(...newick_to_order(sub_newick, prior + parseFloat(value))) + } + break; + + } + + } + + } else { + + if (newick.includes(',')) { + var parts = newick.split(',') + if (parts[0].includes(':')){ + var value = parts[0].split(':')[1] + var branch = parts[0].split(':')[0] + } else { + var value = 0 + var branch = parts[0] + } + + var next_iter = parts.slice(1).join(',') + result.push(...[[branch, prior, prior + parseFloat(value)]]) + // console.log(next_iter) + result.push(...newick_to_order(next_iter, prior)) + + } else{ + if (newick.includes(':')) { + var value = newick.split(':')[1] + var branch = newick.split(':')[0] + } else { + var value = 0 + var branch = newick + } + result.push(...[[branch, prior, prior + parseFloat(value)]]) + } + } + + return(result) +} + +//ANCHOR - All in one SVG creation function +async function generate_svg(body, data) { + + // access all the relevant variables from UI + + var svg_search = []; + var svg_heatmaps = []; + var svg_edges = []; + var svg_nodes = []; + var svg_groups = []; + + var edgecoloring = {} + $("#genomecolors :input[type='color']").each((index, element) => { + edgecoloring[element.id] = [index, element.value] + }) + + if ($('#flexlinear').prop('checked') == true){ + var linear = 1 + } else { + var linear = 0 + } + + if ($('#flexsaturation').prop('checked') == true){ + var saturation = 1 + } else { + var saturation = 0 + } + + var outer_margin = parseInt($('#outer_margin')[0].value); + var inner_margin = parseInt($('#inner_margin')[0].value); + var node_size = parseInt($('#size')[0].value); + var node_thickness = parseInt($('#circ')[0].value); + var edge_thickness = parseInt($('#edge')[0].value); + var line_thickness = parseInt($('#line')[0].value); + var node_distance_x = parseInt($('#distx')[0].value); + var 
node_distance_y = parseInt($('#disty')[0].value); + var tree_length = parseInt($('#tree')[0].value); + var offset = parseInt($('#tree_offset')[0].value); + var tree_thickness = parseInt($('#tree_thickness')[0].value); + + var groups = data['infos']['groups'] + var edges = data['elements']['edges'] + var nodes = data['elements']['nodes'] + + var global_x = data["infos"]["meta"]["global_x_offset"] -1; + var global_y = data["infos"]["meta"]["global_y"] +1; + + var group_color = $('#groups')[0].value; + var node_color = $('#nodes')[0].value; + var layer_color = $('#layer_color')[0].value; + + var theta = 270 / (global_x) + var start_offset = parseInt($('#inner')[0].value); + // var start_offset = 0 + + var middle_layers = new Object(); + var outer_layers = new Object(); + + var search_size = parseInt($('#search_hit')[0].value); + middle_layers['search'] = [search_size, start_offset, search_size + start_offset] + + var arrow_size = parseInt($('#arrow')[0].value) + middle_layers['arrow'] = [arrow_size, search_size + start_offset, arrow_size + search_size + start_offset] + + var graph_size = node_size * 2 + node_thickness + outer_layers['graph'] = [graph_size, inner_margin, inner_margin + graph_size] + + var sum_middle_layer = start_offset + search_size + arrow_size + var sum_outer_layer = graph_size + inner_margin + + var current_middle_stop = sum_middle_layer + var current_outer_stop = sum_outer_layer + + var genomes = Object.keys(data["infos"]["genomes"]) + var enabled = [] + + var genome_size = genomes.length; + + var order = newick_to_order(data['infos']['newick']).reverse() + + max_dist = 0 + item_order = [] + for (var item of order) { + var [name, start, end] = item + if (name != 'branching') { + item_order.push(name) + } + if (end > max_dist) { + max_dist = end + } + } + +// layer size calculations + + for (var genome of item_order) { + + if ($('#flex' + genome).prop('checked') == true){ + enabled.push(genome) + } + + var layer_name = genome + 'layer' + if 
($('#flex' + layer_name).prop('checked') == true){ + var layer_width = parseInt($('#' + layer_name)[0].value) + + var layer_middle_start = current_middle_stop + inner_margin + var layer_middle_stop = layer_middle_start + layer_width + + current_middle_stop = layer_middle_stop + sum_middle_layer += layer_width + inner_margin + + middle_layers[layer_name] = [layer_width, layer_middle_start, layer_middle_stop] + } + } + + var layers = Object.keys(data['infos']['layers_data']) + for (var layer_name of layers) { + if ($('#flex' + layer_name).prop('checked') == true){ + + var layer_width = parseInt($('#' + layer_name)[0].value) + // var layer_scale = data['infos']['layers_data'][layer_name]['scale'] + + // if (layer_scale == 'global') { + // var layer_middle_start = current_middle_stop + inner_margin + // var layer_middle_stop = layer_middle_start + layer_width + + // current_middle_stop = layer_middle_stop + // sum_middle_layer += layer_width + inner_margin + + // middle_layers[layer_name] = [layer_width, layer_middle_start, layer_middle_stop] + // } else { + var layer_outer_start = current_outer_stop + outer_margin + var layer_outer_stop = layer_outer_start + layer_width + + current_outer_stop = layer_outer_stop + sum_outer_layer += layer_width + outer_margin + + outer_layers[layer_name] = [layer_width, layer_outer_start, layer_outer_stop] + // } + } + } + + if (linear == 0){ + var radius = 0.5 * (node_distance_x / Math.sin(deg2rad(theta * (1/2)))) + var circle_dist = sum_middle_layer + graph_size * 0.5 + var extra_offset = 0 + + if (circle_dist < radius) { + extra_offset = radius - circle_dist + } + + sum_middle_layer += extra_offset + for (var layer in middle_layers) { + var [layer_width, layer_start, layer_stop] = middle_layers[layer] + middle_layers[layer] = [layer_width, layer_start + extra_offset, layer_stop + extra_offset] + } + + var y_size = (sum_middle_layer + (global_y * node_distance_y) + sum_outer_layer); + var x_size = (sum_middle_layer + (global_y * 
node_distance_y) + sum_outer_layer); + var svg_core = $('') + } else { + var x_size = global_x * node_distance_x; + var y_size = (sum_middle_layer + (global_y * node_distance_y) + sum_outer_layer); + var svg_core = $('') + } + + for (var genome of genomes) { + var layer_name = genome + 'layer' + if (Object.keys(middle_layers).includes(layer_name)){ + + var [layer_width, layer_start, layer_stop] = middle_layers[layer_name] + for (var x = 1; x < global_x+1; x++) { + + var add_start = 1 + var add_stop = 0 + + var i_x = x-add_start + var i_y = layer_start + var j_x = x+add_stop + var j_y = layer_stop + + // console.log(i_x, i_y, j_x, j_y) + + svg_heatmaps.push( + create_rectangle(i_x, i_y, j_x, j_y, theta, node_distance_x, linear, layer_color) + ) + } + } + } + + // var [search_size, search_start, search_stop] = middle_layers['search'] + + // if (linear == 0){ + // var [circle_h_x, circle_h_y] = transform(0, search_start + search_size * 0.5, theta) + // } else { + // var [circle_h_x, circle_h_y] = [0, -(search_start + search_size * 0.5)] + // } + + // svg_core.append( + // $('Search Hits') + // ) + + if ($('#flexarrow').prop('checked') == true){ + + var [arrow_size, arrow_start, arrow_stop] = middle_layers['arrow'] + + var pointer_length = 5/ theta + var pointer_height = arrow_stop - arrow_start + var arrow_thickness = pointer_height / 4 + var steps = Math.round((30 / theta)) + + if (steps < 1) { + steps = 1 + } + + if (linear == 0){ + var [circle_c_x, circle_c_y] = transform(global_x - pointer_length, arrow_start + arrow_thickness, theta) + var [circle_a_x, circle_a_y] = transform(0, arrow_start + arrow_thickness, theta) + var [circle_b_x, circle_b_y] = transform(0, arrow_stop - arrow_thickness, theta) + var [circle_d_x, circle_d_y] = transform(global_x - pointer_length, arrow_stop - arrow_thickness, theta) + var [circle_f_x, circle_f_y] = transform(global_x - pointer_length, arrow_stop, theta) + var [circle_g_x, circle_g_y] = transform(global_x, arrow_start + 
arrow_thickness * 2, theta) + var [circle_e_x, circle_e_y] = transform(global_x - pointer_length, arrow_start, theta) + + if ((global_x) * theta >= 180) { + var arc_flag = 1 + } else { + var arc_flag = 0 + } + + svg_core.append( + $('') + ) + + var [circle_h_x, circle_h_y] = transform(0, arrow_start + arrow_thickness * 2, theta) + + } else { + var [circle_c_x, circle_c_y] = [(global_x - pointer_length) * node_distance_x, -(arrow_start + arrow_thickness)] + var [circle_a_x, circle_a_y] = [0, -(arrow_start + arrow_thickness)] + var [circle_b_x, circle_b_y] = [0, -(arrow_stop - arrow_thickness)] + var [circle_d_x, circle_d_y] = [(global_x - pointer_length) * node_distance_x , -(arrow_stop - arrow_thickness)] + var [circle_f_x, circle_f_y] = [(global_x - pointer_length) * node_distance_x, -arrow_stop] + var [circle_g_x, circle_g_y] = [global_x * node_distance_x, -(arrow_start + arrow_thickness * 2)] + var [circle_e_x, circle_e_y] = [(global_x - pointer_length) * node_distance_x, -arrow_start] + + svg_core.append( + $('') + ) + + var [circle_h_x, circle_h_y] = [0, -(arrow_start + arrow_thickness * 2)] + } + + svg_core.append( + $('Orientation') + ) + + var l = steps + while (l < global_x) { + + if (l+steps <= global_x){ + var k = steps; + } else { + var k = steps - ((l+steps) - global_x); + } + + if (linear == 0){ + + var [circle_l_x, circle_l_y] = transform(l-0.5, arrow_start + arrow_thickness * 2, theta) + var rotate = theta * (l-0.5) + if (rotate >= 90 && rotate <= 180) { + rotate += 180; + } else if (rotate >= 180 && rotate <= 270) { + rotate -= 180; + } + svg_core.append( + $('' + l + '') + ) + + } else { + var [circle_l_x, circle_l_y] = [(l-0.5) * node_distance_x, -(arrow_start + arrow_thickness * 2)] + svg_core.append( + $('' + l + '') + ) + + } + + l += k + }; + } + + for(var i in edges) { + + var edge = data['elements']['edges'][i]; + var edge_genomes = Object.keys(edge['genome']) + + var intersection = edge_genomes.filter(x => enabled.includes(x)); + if 
(intersection.length > 0) { + var edge_genomes_length = edge_genomes.length; + + var color = pickcolor (edgecoloring, Object.keys(edge['genome'])) + + if (saturation == 1){ + var pick = lighter_color('#ffffff', color, edge_genomes_length / genome_size); + } else { + var pick = color; + } + + var source = edge['source'] + var target = edge['target'] + + if (source != 'start' && target != 'stop' && edge['shown'] == 1){ + + var i_x = nodes[source]['position']['x_offset'] + var i_y = nodes[source]['position']['y'] + var j_x = nodes[target]['position']['x_offset'] + var j_y = nodes[target]['position']['y'] + + for (let e = 0; e <= genomes.length; e++) { + + if (e == genomes.length || (genomes[e] + 'layer' in middle_layers && edge_genomes.includes(genomes[e])) ) { + + if (e == genomes.length) { + + if (edge['direction'] == 'L') { + var stroke = ' stroke-dasharray="' + line_thickness * 5 + ',' + line_thickness * 5 + '" ' + } else if (edge['direction'] == 'B') { + var stroke = ' stroke-dasharray="' + line_thickness * 20 + ',' + line_thickness * 5 + '" ' + } else { + var stroke = '' + } + + var [graph_size, graph_start, graph_stop] = outer_layers['graph'] + var i_y_size = sum_middle_layer + graph_start + graph_size * 0.5 + i_y * node_distance_y + var j_y_size = sum_middle_layer + graph_start + graph_size * 0.5 + j_y * node_distance_y + var draw = pick + var thickness = edge_thickness + } else { + var [layer_width, layer_start, layer_stop] = middle_layers[genomes[e] + 'layer'] + + if (layer_width < line_thickness) { + var draw = '' + } else { + layer_width -= line_thickness + layer_start += line_thickness * 0.5 + layer_stop -= line_thickness * 0.5 + + var i_y_size = layer_start + (i_y + 0.5) * (layer_width / global_y) + var j_y_size = layer_start + (j_y + 0.5) * (layer_width / global_y) + var draw = edgecoloring[genomes[e]][1] + var thickness = line_thickness + var stroke = '' + } + } + + if (linear == 0){ + var [circle_i_x, circle_i_y] = transform(i_x-0.5, i_y_size, theta); 
+ var [circle_j_x, circle_j_y] = transform(j_x-0.5, j_y_size, theta); + } else { + var [circle_i_x, circle_i_y] = [(i_x-0.5) * node_distance_x, -i_y_size]; + var [circle_j_x, circle_j_y] = [(j_x-0.5) * node_distance_x, -j_y_size]; + } + + if (draw !== "") { + + if (edge['bended'] == ""){ + + if (i_y == j_y) { + svg_edges.push( + $('') + ) + } else { + svg_edges.push( + $('') + ) + } + + + } else { + + var bended_edge = '' + } else { + bended_edge += 'L ' + circle_j_x + ' ' + circle_j_y + '"' + stroke + ' stroke="' + draw + '" stroke-width="' + thickness + '" fill="none"/>' + } + } else { + bended_edge += 'L ' + circle_j_x + ' ' + circle_j_y + '"' + stroke + ' stroke="' + draw + '" stroke-width="' + thickness + '" fill="none"/>' + } + + svg_edges.push( + $(bended_edge) + ) + } + } + } + } + } + } + }; + + var group_nodes = []; + var group_dict = {} + for(var g in groups) { + var group = data['infos']['groups'][g] + + for (var node of group) { + group_dict[node] = g + } + }; + group_nodes = Object.keys(group_dict) + + var global_values = [] + for(var k in nodes) { + + if (k != 'start' && k != 'stop') { + + var node = data['elements']['nodes'][k]; + var node_genomes = Object.keys(node['genome']); + + var intersection = node_genomes.filter(x => enabled.includes(x)); + if (intersection.length > 0) { + + var node_genomes_length = node_genomes.length + + var k_x = parseInt(node['position']['x_offset']) + var k_y = parseInt(node['position']['y']) + + if (!group_nodes.includes(k)) { + var node_class = 'class="node' + + var x_value_start = parseInt(k_x) - 1 + var x_value_stop = parseInt(k_x) + + } else { + var node_class = 'stroke-opacity="0" fill-opacity="0" class="pseudo' + var g = group_dict[k] + + var group = data['infos']['groups'][g] + var group_size = group.length + var group_compress = $('#groupcompress')[0].value + var group_size_compressed = Math.round(group_size * group_compress) + + if (group_size_compressed == 0) { + group_size_compressed = 1 + } + + var z_x = 
parseInt(data['elements']['nodes'][group[0]]['position']['x_offset']) + + var fraction = group_size_compressed / (group_size) + var group_id = group.findIndex(x => x === k) + + var x_value_start = parseInt(z_x) - (1 - group_id * fraction) + var x_value_stop = parseInt(z_x) - (1 - (group_id + 1) * fraction) + + } + + var color = pickcolor (edgecoloring, Object.keys(node['genome'])) + + if (saturation == 1) { + var draw = lighter_color('#ffffff', color, node_genomes_length / genome_size); + var draw2 = lighter_color('#ffffff', node_color, node_genomes_length / genome_size) + } else { + var draw = color; + var draw2 = node_color + } + + var [graph_size, graph_start, graph_stop] = outer_layers['graph'] + var k_y_size = sum_middle_layer + graph_start + graph_size * 0.5 + k_y * node_distance_y + + if (linear == 0) { + var [circle_k_x, circle_k_y] = transform(k_x-0.5, k_y_size, theta); + } else { + var [circle_k_x, circle_k_y] = [(k_x-0.5) * node_distance_x, -k_y_size]; + } + + svg_nodes.push( + $('') + ) + + var add_start = 1 + var add_stop = 0 + + var [search_size, search_start, search_stop] = middle_layers['search'] + + var i_x = parseInt(k_x)-add_start + var i_y = search_start + var j_x = parseInt(k_x)+add_stop + var j_y = search_stop + + if (!global_values.includes(k_x)) { + svg_search.push(create_rectangle(i_x, i_y, j_x, j_y, theta, node_distance_x, linear, 'none', k_x)) + } + + for (var layer_name of layers) { + + if ($('#flex' + layer_name).prop('checked') == true){ + + var value = node['layer'][layer_name] + var max = data['infos']['layers_data'][layer_name]['max'] + + var [layer_width, layer_start, layer_stop] = outer_layers[layer_name] + var k_y_size = sum_middle_layer + k_y * node_distance_y + + var i_x = x_value_start + var i_y = layer_start + k_y_size + var j_x = x_value_stop + var j_y = layer_stop + k_y_size + var color = lighter_color('#00ff00', '#ff0000', value / max) + + svg_heatmaps.push(create_rectangle(i_x, i_y, j_x, j_y, theta, node_distance_x, 
linear, color)) + } + } + } + } + + global_values.push(k_x) + + }; + + for(var l in groups) { + + var group = data['infos']['groups'][l] + + var group_length = group.length + var left_node_name = group[0] + var right_node_name = group[group_length-1] + + var left_node = data['elements']['nodes'][left_node_name]; + var right_node = data['elements']['nodes'][right_node_name]; + + var group_genomes = Object.keys(left_node['genome']); + var intersection = group_genomes.filter(x => enabled.includes(x)); + if (intersection.length > 0) { + + var group_genomes_length = group_genomes.length; + + var color = pickcolor (edgecoloring, Object.keys(left_node['genome'])) + + if (saturation == 1) { + var draw = lighter_color('#ffffff', color, group_genomes_length / genome_size); + } else { + var draw = color; + } + + var l_x = parseInt(left_node['position']['x_offset']) + var l_y = parseInt(left_node['position']['y']) + + var m_x = parseInt(right_node['position']['x_offset']) + var m_y = parseInt(right_node['position']['y']) + + var [graph_size, graph_start, graph_stop] = outer_layers['graph'] + var l_y_size = sum_middle_layer + graph_start + graph_size * 0.5 + l_y * node_distance_y + var m_y_size = sum_middle_layer + graph_start + graph_size * 0.5 + m_y * node_distance_y + + var i_x = l_x-0.5 + var i_y = l_y_size + node_size + var j_x = m_x-0.5 + var j_y = m_y_size - node_size + + if (saturation == 1) { + var color = lighter_color('#ffffff', group_color, group_genomes_length / genome_size) + } else { + var color = group_color + } + + if ((l_x - m_x) * theta >= 180) { + var arc_flag = 1 + } else { + var arc_flag = 0 + } + + if (linear == 0) { + var [a_x, a_y] = transform(i_x, i_y, theta) + var [b_x, b_y] = transform(i_x, j_y, theta) + var [c_x, c_y] = transform(j_x, i_y, theta) + var [d_x, d_y] = transform(j_x, j_y, theta) + + var path = $('') + + } else { + var [a_x, a_y] = [i_x * node_distance_x, -i_y] + var [b_x, b_y] = [i_x * node_distance_x, -j_y] + var [c_x, c_y] = [j_x * 
node_distance_x, -i_y] + var [d_x, d_y] = [j_x * node_distance_x, -j_y] + + var path = $('') + } + + svg_groups.push(path) + } + }; + + for (var layer_name of layers) { + + if ($('#flex' + layer_name).prop('checked') == true){ + var [layer_width, layer_start, layer_stop] = outer_layers[layer_name] + var y_size = sum_middle_layer + layer_width * 0.5 + + if (linear == 0){ + var [circle_x, circle_y] = transform(0, (layer_start + y_size), theta) + } else { + var [circle_x, circle_y] = [0, -(layer_start + y_size)] + } + + svg_heatmaps.push( + $('' + layer_name + '') + ) + } + } + + item_dist = {} + for (var genome_name of item_order) { + if ($('#flex' + genome_name + 'layer').prop('checked') == true){ + var [layer_width, layer_start, layer_stop] = middle_layers[genome_name + 'layer'] + + if (layer_width >= edge_thickness) { + + var y_size = layer_start + layer_width * 0.5 + + if (linear == 0){ + var [circle_x, circle_y] = transform(0, y_size, theta) + } else { + var [circle_x, circle_y] = [0, -y_size] + } + + item_dist[genome_name] = circle_y + + svg_edges.push( + $('' + genome_name + '') + ) + } + } + } + + // tree_length + // console.log(Object.keys(item_dist).length, item_order.length) + + if ($('#flextree').prop('checked') == true){ + svg_core.append(draw_newick(order, item_dist, max_dist, offset, tree_length, tree_thickness)) + } + + for (var item of svg_search) svg_core.append(item); + for (var item of svg_heatmaps) svg_core.append(item); + for (var item of svg_edges) svg_core.append(item); + for (var item of svg_nodes) svg_core.append(item); + for (var item of svg_groups) svg_core.append(item); + + body.append(svg_core) + body.html(body.html()); +} + +//ANCHOR - Check node +async function checknode(searchfunction) { + + var keys = Object.keys(searchfunction) + if (keys.length > 0) { + var d = await get_passed_gene_clusters(searchfunction) + var result = d['gene_clusters'] + } else { + var result = [] + } + + return result +} + +function main () { + + $.ajax({ + 
url: "/pangraph/get_json", + type: "POST", + cache: false, + //data: JSON.stringify(data), + contentType: "application/json", + dataType: "json", + success: function(data){ + + var layers = Object.keys(data['infos']['layers_data']) + + for (var layer_name of layers) { + + var layer_scale = data['infos']['layers_data'][layer_name]['scale'] + + if (layer_scale == 'global') { + var value = '50' + } else { + var value = '10' + } + + if ($('#flex' + layer_name + '').length > 0) { + // console.log(layer_name) + } else { + var element = $('
').append( + $('
').append( + $('
').append( + $('') + ) + ) + ).append( + $('
').append( + layer_name + ) + ).append( + $('
').append( + $('') + ) + ) + + $('#layers').append(element) + } + + } + + // This title must update with Generic data from JSON + $('#title-panel-first-line').text(data['infos']['meta']['title']); + $('#title-panel-second-line').text('Pangraph Detail'); + + // It seems unused function after UI changes + $('#redraw').on('click', function() { + + var data = new Object; + + data['condtr'] = parseInt($('#condtr')[0].value) + data['maxlength'] = parseInt($('#maxlength')[0].value) + data['groupcompress'] = parseFloat($('#groupcompress')[0].value) + data['ungroup'] = [parseInt($('#ungroupfrom')[0].value), parseInt($('#ungroupto')[0].value)] + + $("#genomecolors :input[type='checkbox']").each((index, element) => { + + var genome = $(element).attr('name') + if ($(element).prop('checked') == true){ + data[genome] = 'on' + } else { + data[genome] = 'off' + } + }) + + $.ajax({ + url: "/pangraph/settings", + type: "POST", + async: false, + data: JSON.stringify(data), + contentType: "application/json", + dataType: "json", + success: function(){ + $(document).off("click", ".group_choice"); + $(document).off("click", ".binremove"); + $(document).off("click", ".binchoice li a"); + $(document).off("change", ".colorchange"); + $(document).find("*").off(); + main() + } + }); + }) + + $("#condtr")[0].value = data['infos']['gene_cluster_grouping_threshold'] + $("#maxlength")[0].value = data['infos']['max_edge_length_filter'] + $("#groupcompress")[0].value = data['infos']['groupcompress'] + $("#ungroupfrom")[0].value = data['infos']['ungroup'][0] + $("#ungroupto")[0].value = data['infos']['ungroup'][1] + + if (!$('#genomecolors').children().length) { + + for (var [genome, value] of Object.entries(data['infos']['genomes'])) { + + var state = '' + if (value == 'on') { + state = ' checked' + } + + $('#genomecolors').append( + $('
').append( + $('
').append( + $('
').append( + $('') + ) + ) + ).append( + $('
').append( + genome + ) + ).append( + $('
').append( + $('') + ) + ).append( + $('
').append( + $('') + ) + ) + ) + // ) + + $('#RightOffcanvasBodyTop').append( + $('').append( + $('').append( + genome + ) + ).append( + $('').append( + 0 + ) + ) + ) + + if ($('#flex' + genome + 'layer').length > 0) { + // console.log(layer_name) + } else { + var element = $('
').append( + $('
').append( + $('
').append( + $('') + ) + ) + ).append( + $('
').append( + genome + ) + ).append( + $('
').append( + $('') + ) + ) + + $('#layers').append(element) + } + } + } + + var body = $('#svgbox') + body.empty() + + // learn about the functional annotation sources + functional_annotation_sources_available = data['infos']['functional_annotation_sources_available']; + $('#searchSources').empty() + for (var annotation_source of functional_annotation_sources_available){ + $('#searchSources').append( + $('
').append( + $('
').append( + $('
').append( + $('') + ) + ).append( + $('
').append( + annotation_source + ) + ).append( + $('
') + ) + ) + ) + } + + var layers = Object.keys(data['infos']['layers_data']) + + $('#expressiondrop').empty() + $('#expressiondrop').append($('')) + $('#expressiondrop').append($('')) + $('#expressiondrop').append($('')) + + $('#filter').empty() + $('#filter').append( + $('
').append( + $('
').append( + $('
').append( + $('') + ) + ).append( + $('
').append( + 'Min graph position' + ) + ).append( + $('
').append( + $('') + ) + ).append( + $('
').append( + $('') + ) + ).append( + $('
').append( + 'Max graph position' + ) + ).append( + $('
').append( + $('') + ) + ) + ) + ) + + for (var layer_name of layers) { + if ($('#flex' + layer_name).prop('checked') == true){ + + $('#expressiondrop').append($('')) + $('#filter').append( + $('
').append( + $('
').append( + $('
').append( + $('') + ) + ).append( + $('
').append( + 'Min ' + layer_name + ) + ).append( + $('
').append( + $('') + ) + ).append( + $('
').append( + $('') + ) + ).append( + $('
').append( + 'Max ' + layer_name + ) + ).append( + $('
').append( + $('') + ) + ) + ) + ) + } + } + + generate_svg(body, data); + + window.zoomSVG = svgPanZoom('#result', { + zoomEnabled: true, + panEnabled: false, + controlIconsEnabled: false, + minZoom: 0.1, + maxZoom: 100 + }); + + //ANCHOR - Zoom pan functions + // $('#plus').on('click', function() { + // window.zoomSVG.zoomIn(); + // }) + + // $('#minus').on('click', function() { + // window.zoomSVG.zoomOut(); + // }) + + $('#fit').on('click', function() { + window.zoomSVG.resize(); + window.zoomSVG.fit(); + window.zoomSVG.center(); + }) + + //ANCHOR - Main panel response functions + + + + if ($('#condtr')[0].value == -1){ + // $('#customRange2').prop('disabled', true); + $('#flexcondtr').prop('checked', false); + } + + if ($('#maxlength')[0].value == -1){ + // $('#customRange3').prop('disabled', true); + $('#flexmaxlength').prop('checked', false); + } + + if ($('#groupcompress')[0].value == -1){ + $('#flexgroupcompress').prop('checked', false); + } + + $('#flexcondtr').change(function() { + if ($(this).prop('checked') == true){ + $('#condtr')[0].value = 2; + $('#condtr').prop('disabled', false) + // $('#customRange2')[0].value = 2; + // $('#customRange2').prop('disabled', false); + } else { + $('#condtr')[0].value = -1; + $('#condtr').prop('disabled', true) + // $('#customRange2')[0].value = 2; + // $('#customRange2').prop('disabled', true); + } + }) + + $('#flexmaxlength').change(function() { + if ($(this).prop('checked') == true){ + $('#maxlength')[0].value = 1; + $('#maxlength').prop('disabled', false) + // $('#customRange3')[0].value = 1; + // $('#customRange3').prop('disabled', false); + } else { + $('#maxlength')[0].value = -1; + $('#maxlength').prop('disabled', true) + // $('#customRange3')[0].value = 1; + // $('#customRange3').prop('disabled', true); + } + }) + + $('#flexgroupcompress').change(function() { + if ($(this).prop('checked') == true){ + $('#groupcompress')[0].value = 0.0; + $('#groupcompress').prop('disabled', false) + // 
$('#customRange4')[0].value = 0; + // $('#customRange4').prop('disabled', false); + } else { + $('#groupcompress')[0].value = 1.0; + $('#groupcompress').prop('disabled', true) + // $('#customRange4')[0].value = 0; + // $('#customRange4').prop('disabled', true); + } + }) + + $('#flexungroup').change(function() { + if ($(this).prop('checked') == true){ + $('#ungroupfrom')[0].value = 0; + $('#ungroupfrom').prop('disabled', false) + $('#ungroupto')[0].value = 0; + $('#ungroupto').prop('disabled', false) + } else { + $('#ungroupfrom')[0].value = -1; + $('#ungroupfrom').prop('disabled', true) + $('#ungroupto')[0].value = -1; + $('#ungroupto').prop('disabled', true) + } + }) + + $('#flextree').change(function() { + if ($(this).prop('checked') == true){ + var genomes = Object.keys(data["infos"]["genomes"]) + for (var genome of genomes) { + $('#flex' + genome + 'layer').prop('checked', true); + $('#flex' + genome + 'layer').prop('disabled', true); + } + } else { + var genomes = Object.keys(data["infos"]["genomes"]) + for (var genome of genomes) { + $('#flex' + genome + 'layer').prop('disabled', false); + } + } + }) + + sortable('#genomecolors', { + forcePlaceholderSize: true, + handle: '.user-handle', + items: 'div' + }); + + $(document).on("click", ".group_choice", async function() { + + // console.log(this) + + var gene_cluster_id = this.getAttribute("name_id"); + var gene_cluster_context = this.getAttribute("group"); + var add_align = parseInt(this.getAttribute("context")) + + var body = $(this).parent().parent().parent().parent() + + body.empty() + var bodyinfo = $('
') + body.append(bodyinfo) + + // console.log(gene_cluster_id, gene_cluster_context) + + var all_info = await get_gene_cluster_display_tables(gene_cluster_id, gene_cluster_context, data, add_align) + + // console.log(all_info) + + bodyinfo.append(all_info) + // nodeinfo(e.target, data); + }) + + //ANCHOR - Recolor nodes after redraw + var groups = data['infos']['groups'] + for (var binid of Object.keys(bins)) { + var nodes = bins[binid]; + var updated_nodes = [] + + for (var node of nodes) { + + var name = document.getElementById(node); + + if(name) { + if (name.getAttribute('class') == 'pseudo') { + for(var g in groups) { + var group = data['infos']['groups'][g] + if (group.includes(node)) { + if (!updated_nodes.includes(g)){ + updated_nodes.push(g) + } + } + } + } else { + updated_nodes.push(node) + } + } else { + updated_nodes.push(...current_groups[node]) + }; + + } + + bins[binid] = updated_nodes + // console.log(nodes) + // console.log(updated_nodes) + for (var node of bins[binid]) { + + bins[binid] = bins[binid].filter(item => item !== node); + var name = document.getElementById(node); + bins = marknode(name, data, binid, bins); + + } + } + + // console.log(bins) + + current_groups = data['infos']['groups'] + + // console.log(current_groups) + + //ANCHOR - Bin dropdown choice function + $(document).on("click", ".binremove", function() { + + var id = $(this).attr('name_id'); + var binid = $(this).attr('bin'); + + var name = document.getElementById(id); + // console.log(name) + bins = marknode(name, data, binid, bins); + + $(this).parent().parent().parent().remove(); + + // console.log(bins) + + }) + + // ANCHOR - Add bin + $('#binadd').on('click', function() { + binnum += 1; + + $('#bingrid').append( + $('
').append( + $('
').append( + $('
').append( + $('') + ) + ).append( + $('
').append( + $('') + ) + ).append( + $('
').append( + $('') + ) + ).append( + $('
').append( + $('') + ) + ) + ) + ); + + bins['bin' + binnum] = []; + }); + + $('#AddBin').on('click', function() { + + var selection = document.querySelector('input[name="binradio"]:checked'); + var binid = selection.value; + + var basics = $('#node_basics_table') + var node = basics[0].getAttribute("gc_context") + var name = document.getElementById(node); + + bins = marknode(name, data, binid, bins); + + }); + + //ANCHOR - Remove bin + $('#binremove').on('click', function() { + + var selection = document.querySelector('input[name="binradio"]:checked'); + + if (selection !== null) { + var binid = selection.value; + + for (var node of bins[binid]) { + var name = document.getElementById(node); + bins = marknode(name, data, binid, bins); + } + + $("#" + binid).remove(); + delete bins[binid] + var nextbin = document.querySelector('input[name="binradio"]') + + if (nextbin) { + nextbin.checked = true; + } else { + $('#binadd').click(); + } + } + }) + + //ANCHOR - Change bin + $(document).on("change", ".colorchange", function() { + + var binid = this.name; + var nodes = bins[binid]; + + for (var node of nodes) { + + bins[binid] = bins[binid].filter(item => item !== node); + var name = document.getElementById(node); + bins = marknode(name, data, binid, bins); + + } + + }); + + $('#binanalyse').on('click', async function() { + var set = {} + var groups = data['infos']['groups'] + var groups_keys = Object.keys(groups) + var bin_keys = Object.keys(bins) + + var selection = 'COG20_FUNCTION' + set['selection'] = selection + // console.log(bins, bin_keys) + + for (var binid of bin_keys) { + set[binid] = [] + for (var id of bins[binid]) { + if (groups_keys.includes(id)) { + for (var item of groups[id]) { + set[binid].push(data['elements']['nodes'][item]['name']) + } + } else { + set[binid].push(data['elements']['nodes'][id]['name']) + } + } + } + + $.ajax({ + url: "/pangraph/analyse", + type: "POST", + data: JSON.stringify(set), + contentType: "application/json", + dataType: 
"json", + success: function(result){ + + $('#BinAnalyseBody').empty(); + var bin_analyses_table = '' + + bin_analyses_table += '' + + bin_analyses_table += '' + for (bin_key of bin_keys) { + bin_analyses_table += '' + } + + bin_analyses_table += '' + bin_analyses_table += '' + + for (func of Object.keys(result)) { + bin_analyses_table += '' + for (bin_key of Object.keys(result[func])) { + var value = result[func][bin_key] + if (value == 1) { + bin_analyses_table += '' + } else { + bin_analyses_table += '' + } + } + bin_analyses_table += '' + } + + bin_analyses_table += '
' + 'Function' + '' + bin_key + '
' + func + '
' + + $('#BinAnalyseBody')[0].innerHTML = bin_analyses_table + + $('#BinAnalyse').modal('show'); + + } + }) + // console.log(set) + }) + + //ANCHOR - Info bin + $('#bininfo').on('click', async function() { + + var selection = document.querySelector('input[name="binradio"]:checked'); + + // if (selection !== null) { + var binid = selection.value; + var appendlist = []; + + $('#BinModalBody').empty(); + for (var id of bins[binid]) { + var element = document.getElementById(id); + + if (element.getAttribute('class') == 'group') { + gene_cluster_context = id; + gene_cluster_id = data['infos']['groups'][id][0] + } else { + gene_cluster_id = id; + gene_cluster_context = null; + } + + appendlist.push([gene_cluster_id, gene_cluster_context]); + + } + + for (var [gene_cluster_id, gene_cluster_context] of appendlist) { + + var body = $('
').append( + $('
').append( + await get_gene_cluster_display_tables(gene_cluster_id, gene_cluster_context, data, add_align=0) + ) + ) + + if (gene_cluster_context == null) { + var element_id = gene_cluster_id + } else { + var element_id = gene_cluster_context + } + + $('#BinModalBody').append( + $('
').append( + $('
').append( + $('
').append( + $('') + ) + ) + ).append( + body + ) + ) + + // basic_info = {'Gene Cluster': id, 'Genomes': genomes, 'Position': position} + // body.append(get_gene_cluster_display_tables(id, basic_info, gene_cluster_data)) + } + + $('#BinModal').modal('show'); + // } + }) + + $('#InfoDownload').on('click', function() { + + // Variable to store the final csv data + var csv_data = []; + var basics = $('#node_basics_table') + var title = basics[0].getAttribute("gc_id") + var layers = $('#node_layers_table') + var functions = $('#node_functions_table') + + var basics_rows = basics[0].getElementsByTagName('tr'); + var layers_rows = layers[0].getElementsByTagName('tr'); + + var function_rows = functions[0].getElementsByTagName('tr'); + + for (let i = 0; i < function_rows.length; i++) { + + if (i >= basics_rows.length) { + var basics_cols = [] + basics_cols = Array.prototype.concat.apply(basics_cols, basics_rows[1].querySelectorAll('td,th')); + basics_cols = Array.prototype.concat.apply(basics_cols, layers_rows[1].querySelectorAll('td,th')); + + var function_cols = function_rows[i].querySelectorAll('td,th'); + } else { + var basics_cols = [] + basics_cols = Array.prototype.concat.apply(basics_cols, basics_rows[i].querySelectorAll('td,th')); + basics_cols = Array.prototype.concat.apply(basics_cols, layers_rows[i].querySelectorAll('td,th')); + + var function_cols = function_rows[i].querySelectorAll('td,th'); + } + + let csvrow = []; + for (let j = 0; j < basics_cols.length; j++) { + + var info = basics_cols[j].innerHTML + csvrow.push(info); + } + + for (let k = 0; k < function_cols.length; k++) { + + var info = function_cols[k].innerHTML + csvrow.push(info); + } + + // console.log(csvrow) + + // Combine each column value with comma + csv_data.push(csvrow.join(",")); + } + // Combine each row data with new line character + csv_data = csv_data.join('\n'); + + var blob = new Blob([csv_data]); + // var title = data['infos']['meta']['title'] + downloadBlob(blob, title + ".csv"); 
+ }); + + $('#AlignmentDownload').on('click', function() { + + // Variable to store the final csv data + var csv_data = ''; + var alignment = $('#node_sequence_alignments_table') + var basics = $('#node_basics_table') + var title = alignment[0].getAttribute("gc_id") + var xpos = basics[0].getAttribute("gc_pos") + + var alignment_rows = alignment[0].getElementsByTagName('tr'); + + for (let i = 1; i < alignment_rows.length; i++) { + + var alignment_cols = alignment_rows[i].querySelectorAll('td,th'); + + csv_data += ">" + title + "|Genome:" + alignment_cols[0].innerHTML +"|Genecall:" + alignment_cols[1].innerHTML + "|Contig:" + alignment_cols[2].innerHTML + "|Position:" + xpos + "|Direction:" + alignment_cols[3].innerHTML + '\n'; + var genome = '' + + var alignment_nucs = alignment_cols[4].getElementsByTagName('span'); + // console.log(alignment_nucs) + + for (let k = 0; k < alignment_nucs.length; k++) { + + genome += alignment_nucs[k].innerHTML + } + + csv_data += genome.match(/.{1,60}/g).join("\r\n") + "\n" + } + // Combine each row data with new line character + + var blob = new Blob([csv_data]); + // var title = data['infos']['meta']['title'] + downloadBlob(blob, title + ".fa"); + }); + + $('#svgDownload').on('click', function() { + var blob = new Blob([$('#svgbox')[0].innerHTML]); + var title = data['infos']['meta']['title'] + downloadBlob(blob, title + ".svg"); + }); + + $('#searchadd').on('click', function() { + + var selection = document.querySelector('input[name="binradio"]:checked') + var binid = selection.value + + for (var [id, members] of Object.entries(searched)) { + if (!(bins[binid].includes(id))) { + + var e = document.getElementById(id); + bins = marknode(e, data, binid, bins); + + } + } + }) + + $('#searchremove').on('click', function() { + + var selection = document.querySelector('input[name="binradio"]:checked') + var binid = selection.value + + for (var [id, members] of Object.entries(searched)) { + if ((bins[binid].includes(id))) { + + var e = 
document.getElementById(id); + bins = marknode(e, data, binid, bins); + + } + } + }) + + $('#searcherase').on('click', function() { + + for (var [id, members] of Object.entries(searched)) { + for (var mem of members) { + var xpos = data['elements']['nodes'][mem]['position']['x_offset'] + var e = document.getElementById(xpos); + e.setAttribute("fill", "white") + } + } + + searched = {}; + }) + + $('#searchcolor').on('click', function() { + + for (var [id, members] of Object.entries(searched)) { + for (var mem of members) { + var xpos = data['elements']['nodes'][mem]['position']['x_offset'] + var e = document.getElementById(xpos); + e.setAttribute("fill", "#ff0000") + } + } + }) + + const searchEraseButton = document.getElementById('searcherase'); + const searchColorButton = document.getElementById('searchcolor'); + const searchAppendBin = document.getElementById('searchadd'); + const searchRemoveBin = document.getElementById('searchremove'); + + var searched = {} + $('#search').on('click', async function() { + + var searchpos = false + + //Filter Search + var layers_filter = new Object + var mingenomes = ($("#mingenomes").prop('checked') == true && !isNaN(parseFloat($("#mingenomestext")[0].value))) ? parseFloat($("#mingenomestext")[0].value) : -1 + var maxgenomes = ($("#maxgenomes").prop('checked') == true && !isNaN(parseFloat($("#maxgenomestext")[0].value))) ? parseFloat($("#maxgenomestext")[0].value) : -1 + var minposition = ($("#minposition").prop('checked') == true && !isNaN(parseFloat($("#minpositiontext")[0].value))) ? parseFloat($("#minpositiontext")[0].value) : -1 + var maxposition = ($("#maxposition").prop('checked') == true && !isNaN(parseFloat($("#maxpositiontext")[0].value))) ? 
parseFloat($("#maxpositiontext")[0].value) : -1 + + layers_filter['genomes'] = {'min': mingenomes, 'max': maxgenomes} + layers_filter['position'] = {'min': minposition, 'max': maxposition} + + var layers = Object.keys(data['infos']['layers_data']) + for (var layer_name of layers) { + + // console.log("#min" + layer_name + "text") + var minlayer = ($("#min" + layer_name).prop('checked') == true && !isNaN(parseFloat($("#min" + layer_name + "text")[0].value))) ? parseFloat($("#min" + layer_name + "text")[0].value) : -1 + var maxlayer = ($("#max" + layer_name).prop('checked') == true && !isNaN(parseFloat($("#max" + layer_name + "text")[0].value))) ? parseFloat($("#max" + layer_name + "text")[0].value) : -1 + + layers_filter[layer_name] = {'min': minlayer, 'max': maxlayer} + } + + //Expression Search + var expressioncomparison = '' + var expressiondrop = $('#expressiondrop')[0].value + var expressionrel = $('#expressionrel')[0].value + var expressiontext = $('#expressiontext')[0].value + + // console.log(expressiondrop, expressionrel, expressiontext) + + if (expressiondrop != "Choose item" && expressionrel != "Choose operator" && expressiontext != '') { + if (expressionrel == '=') { + if (!isNaN(expressiontext)) { + expressioncomparison = '== ' + expressiontext + } else { + expressioncomparison = '== "' + expressiontext + '"' + } + } else if (expressionrel == '\u{2260}') { + if (!isNaN(expressiontext)) { + expressioncomparison = '!= ' + expressiontext + } else { + expressioncomparison = '!= "' + expressiontext + '"' + } + } else if (expressionrel == '\u{2264}' && !isNaN(expressiontext)) { + expressioncomparison = '<= ' + expressiontext + } else if (expressionrel == '\u{2265}' && !isNaN(expressiontext)) { + expressioncomparison = '>= ' + expressiontext + } else if (expressionrel == '\u{003C}' && !isNaN(expressiontext)) { + expressioncomparison = '< ' + expressiontext + } else if (expressionrel == '\u{003E}' && !isNaN(expressiontext)) { + expressioncomparison = '> ' + 
expressiontext + } else if (expressionrel == '\u{25C2}\u{25AA}\u{25B8}') { + expressioncomparison = '.includes("' + expressiontext + '")' + } else if (expressionrel == '\u{25C2}\u{25AA}') { + expressioncomparison = '.endsWith("' + expressiontext + '")' + } else if (expressionrel == '\u{25AA}\u{25B8}') { + expressioncomparison = '.startsWith("' + expressiontext + '")' + } + } + + // console.log(expressioncomparison) + + //Function Search + var searchfunction = {} + var searchterms = $('#searchFunctionsValue')[0].value.split(","); + for (var source of functional_annotation_sources_available) { + if ($("#flex" + source).prop('checked') == true) { + searchfunction[source] = searchterms + } + } + + console.log(searchfunction) + + var layers_positions = {} + for (var layer_name of Object.keys(layers_filter)) { + + var minlayer = layers_filter[layer_name]['min'] + var maxlayer = layers_filter[layer_name]['max'] + + if (minlayer != -1 || maxlayer != -1 || (minlayer != -1 && maxlayer != -1)) { + + layers_positions[layer_name] = [] + searchpos = true + if (layer_name == 'position') { + var global_x = data["infos"]["meta"]["global_x_offset"] -1; + var layerobjects = new Array(global_x - 1).fill().map((d, i) => i + 1); + // console.log(layerobjects) + } else { + var layerobjects = document.querySelectorAll("." 
+ layer_name) + } + + for (var o of layerobjects) { + + var append = true + + if (layer_name == 'position') { + var value = o + var result = o + } else { + var value = o.getAttribute("name") + var result = o.getAttribute("xpos") + } + + if (minlayer != '-1'){ + if (eval(value + '<' + minlayer)){ + append = false + } + } + + if (maxlayer != '-1'){ + if (eval(value + '>' + maxlayer)){ + append = false + } + } + + if (append == true) { + layers_positions[layer_name].push(parseInt(result)) + } + + } + } + } + + // console.log(layers_positions) + + var positions = [] + var keys = Object.keys(layers_positions) + + if (keys.length > 0) { + for (var pos of layers_positions[keys[0]]) { + + if (keys.length > 0) { + var add = true + for (let k = 1; k < keys.length; k++) { + if (!layers_positions[keys[k]].includes(pos)) { + add = false + } + } + + if (add == true) { + positions.push(pos) + } + } else { + positions.push(pos) + } + } + } + + // console.log(positions) + + if (expressioncomparison == '' && Object.keys(searchfunction).length == 0 && searchpos == false) { + var message = "Please enter valid search parameters." 
+ } else { + // console.log(expressioncomparison, searchfunction, searchpos) + + var passed_gcs = await checknode(searchfunction) + + var nodes = document.querySelectorAll(".node") + for (var node of nodes) { + + var id = node.getAttribute("id") + var node = data['elements']['nodes'][id] + + if (passed_gcs.includes(node['name'])) { + var append = true + } else { + var append = false + } + + if (searchpos == true) { + if (!positions.includes(parseInt(node['position']['x_offset']))) { + append = false + } + } + + if (append == true) { + if (expressiondrop == "Name") { + if (eval('"' + node["name"] + '"' + expressioncomparison)) { + if (!(id in searched)) { + searched[id] = [id] + } + } + } else { + if (!(id in searched)) { + searched[id] = [id] + } + } + } + } + + var groups = document.querySelectorAll(".group") + for (var group of groups) { + var groupid = group.getAttribute("id") + var members = data["infos"]["groups"][groupid] + for (var id of members) { + + node = data['elements']['nodes'][id] + var append = true + + if (passed_gcs.includes(node['name'])) { + var append = true + } else { + var append = false + } + + if (searchpos == true) { + if (!positions.includes(parseInt(node['position']['x_offset']))) { + append = false + } + } + + if (append == true) { + if (expressiondrop == "Name") { + if (eval('"' + node["name"] + '"' + expressioncomparison) || eval('"' + group + '"' + expressioncomparison)) { + + if (!(groupid in searched)) { + searched[groupid] = [id] + } else { + searched[groupid].push(id) + } + } + } else { + if (!(groupid in searched)) { + searched[groupid] = [id] + } else { + searched[groupid].push(id) + } + } + } + } + } + + var table = $('
') + + for (var key of Object.keys(searched)) { + + table.append( + $('
').append( + $('
').append( + $('
' + searched[key] + '') + ) + ) + ) + } + + $('#searchtable').empty() + $('#searchtable').append( + $('
').append( + $('
').append( + table + ) + ) + ) + + var message = 'You have ' + Object.keys(searched).length + ' item(s) in your queue.' + + if (Object.keys(searched).length != 0) { + searchEraseButton.disabled = false; + searchColorButton.disabled = false; + searchAppendBin.disabled = false; + searchRemoveBin.disabled = false; + } + } + + var toastbody = $('#searchtoastbody') + toastbody.empty() + toastbody.append( + message + ) + // var searchtoast = bootstrap.Toast.getOrCreateInstance($('#searchtoast')) + $('#searchtoast').toast('show') + }) + + var nodes = document.querySelectorAll(".node") + var divs = document.querySelectorAll(".node, .group"); + for (var el of divs) { + + if (el.getAttribute("class") == 'group'){ + var id = data["infos"]["groups"][el.getAttribute("id")][0] + var name = el.getAttribute("id") + } else { + var id = el.getAttribute("id") + var name = data['elements']['nodes'][el.getAttribute("id")]['name'] + } + + tippy(el, { + content: '' + name + '' + '
', + allowHTML: true, + onHide(instance) { + if (instance.reference.id.startsWith('GCG_')){ + var id = data["infos"]["groups"][instance.reference.id][0] + } else { + var id = instance.reference.id + } + var elements = Object.keys(data['elements']['nodes'][id]['genome']) + + for (var element of elements) { + $('#number_' + element)[0].innerText = '0'; + } + }, + onShow(instance) { + if (instance.reference.id.startsWith('GCG_')){ + var id = data["infos"]["groups"][instance.reference.id][0] + } else { + var id = instance.reference.id + } + var elements = Object.keys(data['elements']['nodes'][id]['genome']) + + for (var element of elements) { + $('#number_' + element)[0].innerText = '1'; + } + }, + arrow: false, + duration: 0, + followCursor: true, + theme: "light", + }); + }; + + var isDown = false + var diff = 0 + + var old_xpos = 0 + var old_ypos = 0 + + var xpos = 0 + var ypos = 0 + + var new_xpos = 0 + var new_ypos = 0 + + $("#svgbox").on('mousedown', function(e) { + old_xpos = e.offsetX + old_ypos = e.offsetY + + xpos = old_xpos + ypos = old_ypos + + isDown = true + diff = 0 + }) + + $("#svgbox").on('mousemove', function(e) { + if (isDown === true) { + new_xpos = e.offsetX + new_ypos = e.offsetY + + diff += Math.sqrt((new_xpos-xpos)^2+(new_ypos-ypos)^2) + + if (!e.shiftKey) { + window.zoomSVG.panBy({x: new_xpos-xpos, y: new_ypos-ypos}) + } + + xpos = new_xpos + ypos = new_ypos + } + }) + + $("#svgbox").on('mouseup', function(e) { + if (isDown === true) { + + var selection = document.querySelector('input[name="binradio"]:checked') + + isDown = false + + if (diff < 10) { + if (e.target.getAttribute('class') === 'group' || e.target.getAttribute('class') === 'node') { + + if (e.shiftKey && selection !== null) { + + var binid = selection.value + bins = marknode(e.target, data, binid, bins); + + } else { + //show_node_details_modal(e.target, data); + // console.log(e.target) + nodeinfo(e.target, data); + } + + } else { + } + + } else { + if (e.shiftKey && selection !== 
null) { + + var binid = selection.value + + var max_xpos = Math.max(old_xpos, xpos) + var min_xpos = Math.min(old_xpos, xpos) + + var max_ypos = Math.max(old_ypos, ypos) + var min_ypos = Math.min(old_ypos, ypos) + + for (var n of nodes) { + + var bounding = n.getBoundingClientRect(); + var left = bounding.left + var right = bounding.right + var bottom = bounding.bottom + var top = bounding.top + + if ( + min_xpos < left && + max_xpos > right && + min_ypos < bottom && + max_ypos > top + ) { + bins = marknode(n, data, binid, bins); + + } + } + + var groups = data['infos']['groups'] + for(var g in groups) { + // var inside = true; + var group = data['infos']['groups'][g] + for (var k of group) { + var node = document.getElementById(k); + + var bounding = node.getBoundingClientRect(); + var left = bounding.left + var right = bounding.right + var bottom = bounding.bottom + var top = bounding.top + + if ( + min_xpos < left && + max_xpos > right && + min_ypos < bottom && + max_ypos > top + ) { + var name = document.getElementById(g); + bins = marknode(name, data, binid, bins); + break + } + // if ( + // min_xpos < left && + // max_xpos > right && + // min_ypos < bottom && + // max_ypos > top + // ) { + // } else { + // inside = false + // } + } + + // if (inside === true){ + // var name = document.getElementById(g); + // bins = marknode(name, data, binid, bins); + // } + } + } + } + } + }) + document.body.addEventListener('keydown', function(ev) { + if ((/^(?:input|select|textarea|button)$/i).test(ev.target.nodeName)) + { + // shortcuts should not work if user is entering text to input. 
+ return false; + } + if (ev.keyCode === 83) { // S = 83 + $('#toggle-panel-left').trigger('click'); + } + if (ev.keyCode === 84) { // T = 84 + $('#title-panel').toggle(); + $('#toggle-panel-top').toggleClass('invisible visible'); + } + }); + }}) + +} + +//ANCHOR - Main function after loading DOM +$(document).ready(function() { + + // console.log('start pangrah layout creation!') + main() + +}); diff --git a/anvio/data/interactive/package.json b/anvio/data/interactive/package.json index 55bba502f..7e33253a7 100644 --- a/anvio/data/interactive/package.json +++ b/anvio/data/interactive/package.json @@ -34,6 +34,7 @@ "randomcolor": "0.6.2", "sortable-tablesort": "3.2.2", "svg-pan-zoom": "3.6.1", + "tippy.js": "6.3.7", "toastr": "2.1.2" } } diff --git a/anvio/data/interactive/pangraph.html b/anvio/data/interactive/pangraph.html new file mode 100644 index 000000000..58dc6c93c --- /dev/null +++ b/anvio/data/interactive/pangraph.html @@ -0,0 +1,714 @@ + + + + + + + Gene Cluster Network + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+
+ +
+ +
+
+
+
+ SETTINGS +
+
+ +
+ +
+ +
+ + + + + + + +
+ +
+ + + + diff --git a/anvio/docs/__init__.py b/anvio/docs/__init__.py index 14d26e78a..09e9e037f 100644 --- a/anvio/docs/__init__.py +++ b/anvio/docs/__init__.py @@ -621,6 +621,18 @@ "provided_by_anvio": True, "provided_by_user": False }, + "pan-graph-json": { + "name": "PAN GRAPH JSON", + "type": "JSON", + "provided_by_anvio": True, + "provided_by_user": False + }, + "pan-graph": { + "name": "PAN GRAPH", + "type": "CONCEPT", + "provided_by_anvio": True, + "provided_by_user": False + }, "interactive": { "name": "INTERACTIVE DISPLAY", "type": "DISPLAY", diff --git a/anvio/filesnpaths.py b/anvio/filesnpaths.py index 2ac5ea540..f7d429bd7 100644 --- a/anvio/filesnpaths.py +++ b/anvio/filesnpaths.py @@ -269,14 +269,18 @@ def is_file_json_formatted(file_path): return True -def is_file_fasta_formatted(file_path): +def is_file_fasta_formatted(file_path, dont_raise=False): is_file_exists(file_path) try: f = u.SequenceSource(file_path) except u.FastaLibError as e: - raise FilesNPathsError("Someone is not happy with your FASTA file '%s' (this is " - "what the lib says: '%s'." % (file_path, e)) + if dont_raise: + f.close() + return False + else: + raise FilesNPathsError("Someone is not happy with your FASTA file '%s' (this is " + "what the lib says: '%s'." 
% (file_path, e)) f.close() diff --git a/anvio/interactive.py b/anvio/interactive.py index 07e9c3cb2..13afed649 100644 --- a/anvio/interactive.py +++ b/anvio/interactive.py @@ -5,6 +5,7 @@ import os import sys import copy +import json import numpy import argparse import textwrap @@ -2979,6 +2980,80 @@ def get_metabolism_data(self): return self.estimator.get_metabolism_data_for_visualization() +class PangraphInteractive(PanSuperclass): + def __init__(self, args, run=run, progress=progress): + self.mode = "pangraph" + + self.args = args + self.run = run + self.progress = progress + + A = lambda x: self.args.__dict__[x] if x in self.args.__dict__ else None + self.pan_graph_json_path = A('pan_graph_json') + self.pan_graph_summary_path = A('pan_graph_summary') + + if self.pan_graph_json_path: + filesnpaths.is_file_json_formatted(self.pan_graph_json_path) + + if self.pan_graph_summary_path: + filesnpaths.is_file_json_formatted(self.pan_graph_summary_path) + + if not self.pan_graph_json_path: + raise ConfigError("Unfortunately you can only use this program with the `--pan-graph-json` parameter.") + + if A('pan_db'): + PanSuperclass.__init__(self, self.args) + else: + self.run.warning("Since you did not provide anvi'o with a pan-db file, this program will initiate " + "the pangenome graph interface with the JSON file you have provided", + header="NO PAN DB, BUT ALL GOOD 👍", lc="yellow") + + self.pan_graph_json = self.get_pangraph_json() + self.pan_graph_summary = self.get_pangraph_summary() + + + def get_pangraph_json(self): + """A function to 'get' pangraph JSON data from wherever appropriate. + + Currently this function returns the user-provided JSON data, but it + will take into consideration other parameters and pan-db to determine + which resource is the most appropriate to retrieve the pangraph data. 
+ """ + + if self.pan_graph_json_path: + json_data = json.load(open(self.pan_graph_json_path)) + else: + # FIXME: this is where we will get things from the pan-db, but it + # is not yet implemented + raise ConfigError("Not implemented.") + + # make sure the JSON data we have access is compatible with the codebase + # we are running. + version_we_have = str(json_data['infos']['meta']['version']) if 'version' in json_data['infos']['meta'] else None + version_we_want = str(t.pangraph_json_version) + + if version_we_have == None: + raise ConfigError("Bad news: the pan-graph data you have is outdated :/ You will have to re-run " + "the program `anvi-pan-graph` on these data to generate an up-to-date " + "pan-graph artifact that your current codebase can work with.") + + if version_we_have != version_we_want: + raise ConfigError(f"We have a problem here. The data for your pan-graph is at version {version_we_have}. " + f"Yet the anvi'o codebase you have here can only work with pan-graph version {version_we_want}. 
" + f"The best solution to address this mismatch is to re-run the program `anvi-pan-graph` " + f"on your dataset.") + + # all good with the version stuff: + return json_data + + + def get_pangraph_summary(self): + if self.pan_graph_summary_path: + summary_data = json.load(open(self.pan_graph_summary_path)) + + return summary_data + + class ContigsInteractive(): def __init__(self, args, run=run, progress=progress): self.mode = 'contigs' diff --git a/anvio/panops.py b/anvio/panops.py index 3126dcb7b..2db8069c5 100644 --- a/anvio/panops.py +++ b/anvio/panops.py @@ -10,7 +10,17 @@ import json import math import copy +import argparse import pandas as pd +import networkx as nx +import numpy as np +import itertools as it +import matplotlib.pyplot as plt +# from scipy.stats import entropy +from scipy.cluster.hierarchy import linkage, fcluster, dendrogram, to_tree +from scipy.spatial.distance import cdist, squareform +import random + from itertools import chain # multiprocess is a fork of multiprocessing that uses the dill serializer instead of pickle @@ -30,6 +40,7 @@ import anvio.filesnpaths as filesnpaths import anvio.tables.miscdata as miscdata +from anvio.dbinfo import DBInfo from anvio.drivers.blast import BLAST from anvio.drivers.diamond import Diamond from anvio.drivers.mcl import MCL @@ -56,6 +67,7 @@ additional_param_sets_for_sequence_search = {'diamond' : '--masking 0', 'ncbi_blast': ''} + class Pangenome(object): def __init__(self, args=None, run=run, progress=progress): self.args = args @@ -1091,3 +1103,2179 @@ def process(self): self.run.quit() + + +class Pangraph(): + def __init__(self, args, run=run, progress=progress): + self.args = args + self.run = run + self.progress = progress + + A = lambda x: args.__dict__[x] if x in args.__dict__ else None + # important Anvi'o artifacts that are necessary for successfull execution + self.pan_db = A('pan_db') + self.external_genomes_txt = A('external_genomes') + self.genomes_storage_db = A('genomes_storage') + + # 
additional optinal input variables + self.testing_yaml = A('testing_yaml') + self.pan_graph_json = A('pan_graph_json') + self.gene_additional_data = A('gene_additional_data') + self.gc_additional_data = A('gc_additional_data') + + # additional variables for presetting the UI values + self.max_edge_length_filter = A('max_edge_length_filter') + self.gene_cluster_grouping_threshold = A('gene_cluster_grouping_threshold') + self.groupcompress = A('gene_cluster_grouping_compression') + self.ungroup = A('ungrouping_area') + self.pass_filter = 0 + + # additional variables for special cases e.g. a user wants to tune the tool + # in a very specific direction + # TODO: skip genomes might be a very useful function + self.priority_genome = A('priority_genome') + + if A('genome_names'): + self.genomes_names = A('genome_names').split(',') + else: + self.genomes_names = [] + + if A('genome_reverse'): + self.genomes_reverse = A('genome_reverse').split(',') + else: + self.genomes_reverse = [] + + # the different data storage related variables e.g. 
input and output files + # TODO: DB storage is not yet implemented -> will be GRAPH.db at one point + self.skip_storing_in_pan_db = True + self.json_output_file_path = A('output_pan_graph_json') + + self.output_graphml = A('output_graphml') + self.output_yaml = A('output_testing_yaml') + self.output_summary = A('output_dir_summary') + self.output_graphics = A('output_dir_graphics') + + self.output_raw_gc_additional_data = A('output_raw_gc_additional_data') + self.output_raw_gene_additional_data = A('output_raw_gene_additional_data') + + # learn what gene annotation sources are present across all genomes if we + # are running things in normal mode + if not self.testing_yaml and not self.pan_graph_json: + self.functional_annotation_sources_available = DBInfo(self.genomes_storage_db, expecting='genomestorage').get_functional_annotation_sources() + else: + self.functional_annotation_sources_available = [] + + # dictionary containing the layer information that will be saved in the + # json file to be used in the front end + self.data_table_dict = {} + + # this is the dictionary that wil keep all data that is going to be loaded + # from anvi'o artifacts + self.gene_synteny_data_dict = {} + self.genomes = [] + + # these are the main graph data structures for the different steps of the tools execution + self.initial_graph = nx.DiGraph() + self.pangenome_graph = nx.DiGraph() + self.edmonds_graph = nx.DiGraph() + self.ancest = nx.DiGraph() + + # additional self variables that are used during the tools execution + self.single_copy_core = set() + self.grouping = {} + self.project_name = '' + self.global_y = 0 + self.global_x = 1 + self.global_x_offset = 0 + self.k = 0 + self.genome_gc_occurence = {} + self.ghost = 0 + self.debug = False + self.jsondata = {} + self.position = {} + self.offset = {} + self.x_list = {} + self.path = {} + self.edges = {} + self.layers = [] + self.removed = set() + + def sanity_check(self): + + """Sanity check for incompatible settings, like skip 
    def sanity_check(self):
        """Sanity check for incompatible settings, like skip storing and no output path."""

        # storing into the pan DB is not implemented yet (`skip_storing_in_pan_db` is
        # hard-wired to True in __init__), so a JSON output path is the only way to
        # persist results and therefore mandatory
        if self.skip_storing_in_pan_db and not self.json_output_file_path:
            raise ConfigError("You are initializing the Pangraph class with `--skip-storing-in-pan-db` without an `--output-pan-graph-json` "
                              "parameter for the graph results to be stored. Please set an output file path so anvi'o has at least one "
                              "way to store results.")

        # if not self.genomes_names:
        #     raise ConfigError("Please be nice enough to use a genome name that is present in your dataset, you know, to let the code take the wheel.")


    def process(self):
        """Primary driver function for the class.

        Dispatches the pipeline steps based on which of the three input modes is
        active: an existing pan-graph JSON (update mode), a testing YAML (testing
        mode), or the regular anvi'o databases (normal mode).
        """

        self.sanity_check()

        # the pathway over the pan_graph_json serves the purpose of updating the
        # json file
        if self.pan_graph_json:
            self.load_graph_from_json_file()

        if not self.pan_graph_json and self.testing_yaml:
            # skip sanity check EVERYTHING in case of testing mode
            self.prepare_yaml()

        if not self.pan_graph_json and not self.testing_yaml:
            # populate self.gene_synteny_data_dict
            self.get_gene_synteny_data_dict()

        if not self.pan_graph_json and self.output_yaml:
            self.export_gene_synteny_to_yaml()

        if not self.pan_graph_json and not self.output_yaml:
            # contextualize paralogs
            # TODO Incorporate gene direction
            self.run_contextualize_paralogs_algorithm()

            # build graph
            self.build_graph()

            # reconnect open leaves in the graph to generate
            # a flow network from left to right
            self.run_tree_to_flow_network_algorithm()

            self.prepare_synteny_graph()

        if not self.output_yaml:
            self.run_synteny_layout_algorithm()

        if not self.pan_graph_json and not self.output_yaml and not self.testing_yaml:
            # run Alex's layout algorithm
            self.generate_data_table()

        if not self.output_yaml and self.output_raw_gc_additional_data:
            self.get_additional_gc_layer_table()

        if not self.output_yaml and self.output_raw_gene_additional_data:
            self.get_additional_gene_layer_table()

        if not self.pan_graph_json and not self.output_yaml and self.gc_additional_data:
            self.add_additional_gc_layer_values()

        if not self.pan_graph_json and not self.output_yaml and self.gene_additional_data:
            self.add_additional_gene_layer_values()

        if not self.pan_graph_json and not self.output_yaml and self.output_summary:
            self.get_genomic_motifs()

        if not self.pan_graph_json and not self.output_yaml:
            self.get_json_dict_for_graph()

        if self.pan_graph_json:
            self.update_json_dict()

        if not self.output_yaml:
            # store network in the database
            self.store_network()


    def prepare_yaml(self):
        """Populate self.gene_synteny_data_dict from a testing YAML file.

        Each genome maps to a list of space-separated gene cluster strings (one
        per synthetic contig); a trailing '!' on a cluster name marks reverse
        direction ('r').
        """
        yaml_genomes = utils.get_yaml_as_dict(self.testing_yaml)
        self.project_name = 'YAML TEST'

        for genome in yaml_genomes.keys():
            self.gene_synteny_data_dict[genome] = {}

            # NOTE(review): a genome reported as "Skipped" here is only left out of
            # self.genomes -- its synteny data is still loaded into
            # self.gene_synteny_data_dict below, unlike in get_gene_synteny_data_dict()
            # where skipped genomes get no data at all. Confirm this is intended.
            if genome in self.genomes_names:
                self.genomes.append(genome)
                self.run.info_single(f"Loaded genome {genome}.")
            else:
                self.run.info_single(f"Skipped genome {genome}.")

            for i, gcs in enumerate(yaml_genomes[genome]):

                # synthetic contig name, e.g. 'GENOME_00000'
                contig = genome + '_' + str(i).zfill(5)

                self.gene_synteny_data_dict[genome][contig] = {}

                for j, gc in enumerate(gcs.split(' ')):
                    # a trailing '!' flags a reverse-direction gene cluster
                    if gc.endswith('!'):
                        gc = gc[:-1]
                        direction = 'r'
                    else:
                        direction = 'f'

                    self.gene_synteny_data_dict[genome][contig][j] = {
                        'gene_cluster_name': gc,
                        'gene_cluster_id': '',
                        'direction': direction,
                        'rev_compd': 'False',
                        'max_num_paralogs': 0
                    }


    def export_gene_synteny_to_yaml(self):
        """Write self.gene_synteny_data_dict back out as a testing YAML file.

        Inverse of `prepare_yaml`: each contig becomes one space-separated string
        of gene cluster names, with '!' appended to reverse-direction clusters.
        """

        self.run.warning(None, header="Export the gene synteny to yaml file", lc="green")
        gene_synteny_yaml_dict = {}
        for genome in self.gene_synteny_data_dict.keys():
            gene_synteny_yaml_dict[genome] = []
            for contig in self.gene_synteny_data_dict[genome].keys():
                contig_data = self.gene_synteny_data_dict[genome][contig]

                contig_string = []

                for gene_call in contig_data.keys():
                    if contig_data[gene_call]['direction'] == 'f':
                        contig_string += [contig_data[gene_call]['gene_cluster_name']]
                    else:
                        contig_string += [contig_data[gene_call]['gene_cluster_name'] + '!']

                gene_synteny_yaml_dict[genome] += [' '.join(contig_string)]

        utils.save_dict_as_yaml(gene_synteny_yaml_dict, self.output_yaml)

        self.run.info_single("Done")
    def get_gene_synteny_data_dict(self):
        """A function to reduce a comprehensive data structure from anvi'o artifacts for
        downstream analyses.

        Reads the external genomes file plus the pan DB, then for each selected genome
        merges gene calls, sequences, functions and gene-cluster additional data into
        one per-contig dict stored in self.gene_synteny_data_dict.
        """
        self.run.warning(None, header="Loading data from database", lc="green")

        filesnpaths.is_file_tab_delimited(self.external_genomes_txt)

        if not utils.is_all_columns_present_in_TAB_delim_file(["name","contigs_db_path"], self.external_genomes_txt):
            raise ConfigError("Your external genomes file does not seem to contain that anvi'o expects to find "
                              "in an external genomes file :/")

        pan_db = dbops.PanSuperclass(self.args, r=terminal.Run(verbose=False), p=terminal.Progress(verbose=False))

        self.project_name = pan_db.p_meta['project_name']

        pan_db.init_gene_clusters()
        pan_db.init_gene_clusters_functions_summary_dict()
        pan_db.init_items_additional_data()

        gene_cluster_dict = pan_db.gene_callers_id_to_gene_cluster
        additional_info_cluster = pan_db.items_additional_data_dict

        external_genomes = pd.read_csv(self.external_genomes_txt, header=0, sep="\t", names=["name","contigs_db_path"])
        external_genomes.set_index("name", inplace=True)

        for genome, contigs_db_path in external_genomes.iterrows():

            # an empty self.genomes_names means "load everything"
            if genome in self.genomes_names or not self.genomes_names:

                self.genomes.append(genome)

                args = argparse.Namespace(contigs_db=contigs_db_path.item())
                contigs_db = dbops.ContigsSuperclass(args, r=terminal.Run(verbose=False), p=terminal.Progress(verbose=False))

                caller_id_cluster = gene_cluster_dict[genome]
                caller_id_cluster_df = pd.DataFrame.from_dict(caller_id_cluster, orient="index", columns=["gene_cluster_name"]).rename_axis("gene_caller_id").reset_index()
                caller_id_cluster_df["gene_cluster_id"] = ""

                contigs_db.init_functions()
                gene_function_calls_df = pd.DataFrame.from_dict(contigs_db.gene_function_calls_dict, orient="index").rename_axis("gene_caller_id").reset_index()

                all_gene_calls = caller_id_cluster_df['gene_caller_id'].values.tolist()
                genes_in_contigs_df = pd.DataFrame.from_dict(contigs_db.get_sequences_for_gene_callers_ids(all_gene_calls, include_aa_sequences=True, simple_headers=True)[1], orient="index").rename_axis("gene_caller_id").reset_index()
                additional_info_df = pd.DataFrame.from_dict(additional_info_cluster, orient="index").rename_axis("gene_cluster_name").reset_index()
                # suffix '_known' avoids column collisions with the per-gene function columns
                additional_info_df.rename(columns={entry: entry + "_known" for entry in self.functional_annotation_sources_available}, inplace=True)

                joined_contigs_df = caller_id_cluster_df.merge(genes_in_contigs_df, on="gene_caller_id", how="left").merge(gene_function_calls_df, on="gene_caller_id", how="left").merge(additional_info_df, on="gene_cluster_name", how="left")

                # NOTE(review): genomes listed in `--genome-reverse` are sorted
                # ascending by position while all others are sorted *descending* --
                # this looks inverted relative to the flag's name; confirm which
                # orientation downstream synteny extraction actually expects.
                if genome in self.genomes_reverse:
                    joined_contigs_df.sort_values(["contig", "start", "stop"], axis=0, ascending=True, inplace=True)
                else:
                    joined_contigs_df.sort_values(["contig", "start", "stop"], axis=0, ascending=False, inplace=True)

                joined_contigs_df.set_index(["contig", "gene_caller_id"], inplace=True)

                # nested dict: {contig: {gene_caller_id: {column: value, ...}, ...}}
                self.gene_synteny_data_dict[genome] = joined_contigs_df.fillna("None").groupby(level=0).apply(lambda df: df.xs(df.name).to_dict("index")).to_dict()

                self.run.info_single(f"Loaded genome {genome}.")

            else:
                self.run.info_single(f"Skipped genome {genome}.")

        if not self.genomes:
            raise ConfigError(f"Please keep at least one genome in the dataset. With the current setting you skip over all.")
        else:
            if not self.genomes_names:
                self.genomes_names = self.genomes

        self.run.info_single("Done")


    def context_distance(self, a, b):
        """Distance in [0, 1] between two synteny context frames `a` and `b` that
        share the same central gene cluster.

        Returns 0 for identical frames, 1 when the two frames co-occur in at least
        one genome (i.e. they must represent distinct paralogs), and otherwise
        1 - (best match score / frame length), where the frames are compared both
        in the given orientation and with `b` reversed.
        """
        r_val = 0
        f_val = 0
        div = len(a)

        # both frames must be centered on the same gene cluster
        if not a[int(len(a) / 2)] == b[int(len(b) / 2)]:
            raise ConfigError(f"This is very weird and should not have happend, we are very sorry :(.")

        # only frames whose genome sets are disjoint may be merged; overlapping
        # genome sets mean the contexts belong to different paralogs (distance 1)
        if set(self.genome_gc_occurence[tuple(a)].keys()).isdisjoint(set(self.genome_gc_occurence[tuple(b)].keys())):
            for i, (n, m) in enumerate(zip(a, b)):

                if not i == int(len(a) / 2):
                    # positions starting with '-'/'+' are contig-edge placeholders;
                    # two differing placeholders score half a match
                    if (n[0] == '-' and m[0] == '-') or (n[0] == '+' and m[0] == '+'):
                        if n[1] != m[1]:
                            r_val += 0.5
                    elif n[0] == '-' or m[0] == '-' or n[0] == '+' or m[0] == '+':
                        r_val += 0  # placeholder vs. real cluster: no score
                    elif n == m:
                        r_val += 1

            # same scoring with b reversed, to catch inverted contexts
            for i, (n, m) in enumerate(zip(a, b[::-1])):

                if not i == int(len(a) / 2):
                    if (n[0] == '-' and m[0] == '-') or (n[0] == '+' and m[0] == '+'):
                        if n[1] != m[1]:
                            f_val += 0.5
                    elif n[0] == '-' or m[0] == '-' or n[0] == '+' or m[0] == '+':
                        f_val += 0
                    elif n == m:
                        f_val += 1

            if r_val >= f_val:
                return (1 - r_val / div)
            else:
                return (1 - f_val / div)

        elif tuple(a) == tuple(b):
            return (0)

        else:
            return (1)


    def context_split(self, label, gcp):
        """Cluster the context frames in `label` (all centered on gene cluster `gcp`)
        into paralog groups.

        Uses Ward-linkage hierarchical clustering over pairwise `context_distance`
        values and picks the largest cut threshold whose clusters contain no pair
        at distance 1.0 (frames that must be distinct paralogs). Returns a list of
        (frame, cluster_id) tuples; optionally saves a dendrogram PDF per cluster
        when `self.output_graphics` is set.
        """

        if len(label) == 1:
            result = list(zip(label, [1]))
            return (result)

        else:
            X = cdist(np.asarray(label), np.asarray(label), metric=self.context_distance)
            condensed_X = squareform(X)

            Z = linkage(condensed_X, 'ward')

            # try thresholds from largest to smallest until no cluster contains a
            # forbidden (distance == 1.0) pair
            for t in sorted(set(sum(Z.tolist(), [])), reverse=True):
                clusters = fcluster(Z, t, criterion='distance')

                valid = True
                for c in set(clusters.tolist()):
                    pos = np.where(clusters == c)[0]

                    for i, j in it.combinations(pos, 2):

                        if X[i][j] == 1.:
                            valid = False

                if valid is True:
                    break

            result = list(zip(label, clusters))

            if self.output_graphics:
                fig = plt.figure(figsize=(25, 10))
                ax = plt.axes()
                dn = dendrogram(Z, ax=ax, labels=label, orientation='right')
                ax.axvline(linestyle='--', x=t)
                plt.tight_layout()
                fig.savefig(self.output_graphics + gcp + '.pdf')

            return (result)
    def run_contextualize_paralogs_algorithm(self):
        """A function that resolves the graph context of paralogs based on gene synteny
        information across genomes.

        Iteratively grows a symmetric context window of radius self.k around every
        gene cluster occurrence until no window appears more than once within a
        genome, then splits the remaining ambiguous contexts with `context_split`
        and writes the resulting unique 'gene_cluster_id' back into
        self.gene_synteny_data_dict.
        """
        self.run.warning(None, header="Select paralog context", lc="green")

        unresolved = True
        solved = set()

        genome_gc_order_values = {}
        genome_gc_order_left_placeholder = {}
        genome_gc_order_right_placeholder = {}

        while unresolved:

            unresolved = False
            drop = set()

            for genome in self.gene_synteny_data_dict.keys():
                for contig in self.gene_synteny_data_dict[genome].keys():
                    genome_gc_order = [self.gene_synteny_data_dict[genome][contig][gene_call]["gene_cluster_name"] for gene_call in self.gene_synteny_data_dict[genome][contig].keys()]

                    # on the first pass (k == 0) assign per-contig edge placeholders;
                    # repeated identical contigs get distinct suffixes '-a'/'+a', '-b'/'+b', ...
                    if self.k == 0:
                        if genome not in genome_gc_order_values:
                            genome_gc_order_values[genome] = {contig: genome_gc_order}
                            genome_gc_order_left_placeholder[genome] = {contig: '-0'}
                            genome_gc_order_right_placeholder[genome] = {contig: '+0'}
                        else:
                            items = list(genome_gc_order_values[genome].values())
                            times = items.count(genome_gc_order)
                            if times == 0:
                                genome_gc_order_values[genome][contig] = genome_gc_order
                                genome_gc_order_left_placeholder[genome][contig] = '-0'
                                genome_gc_order_right_placeholder[genome][contig] = '+0'
                            else:
                                genome_gc_order_values[genome][contig] = genome_gc_order
                                genome_gc_order_left_placeholder[genome][contig] = '-' + chr(ord('a')+times-1)
                                genome_gc_order_right_placeholder[genome][contig] = '+' + chr(ord('a')+times-1)

                    left_place = genome_gc_order_left_placeholder[genome][contig]
                    right_place = genome_gc_order_right_placeholder[genome][contig]

                    # slide a window of radius self.k over the contig, padding with
                    # the edge placeholders where the window overhangs the contig
                    for i in range(0, len(genome_gc_order)):

                        start = (i - self.k) if (i - self.k) >= 0 else 0
                        stop = (i + self.k + 1) if (i + self.k + 1) <= len(genome_gc_order) else len(genome_gc_order)
                        entry = genome_gc_order[start:stop]

                        if len(entry) == 1 + (2 * self.k):
                            gc_k = tuple(entry)
                        elif start == 0 and stop == len(genome_gc_order):
                            gc_k = tuple([left_place] * (self.k - i) + entry + [right_place] * ((i + self.k + 1) - len(genome_gc_order)))
                        elif start == 0:
                            gc_k = tuple([left_place] * (self.k - i) + entry)
                        elif stop == len(genome_gc_order):
                            gc_k = tuple(entry + [right_place] * ((i + self.k + 1) - len(genome_gc_order)))
                        else:
                            raise ConfigError(f"The search frame of entry {entry} is malformed. It is not tuple of size {1 + (2 * self.k)}"
                                              f"neiter is the frame overlapping with the beginning and the end of a very short contig"
                                              f"nor with either of those. This is weird and should not happen, please check your input data"
                                              f"and make sure there is nothing very weird going on there. Sorry :/")

                        if len(gc_k) != 1 + (2 * self.k):
                            raise ConfigError(f"Wow, for some unforseeable reason the length of the search frame is not matching the"
                                              f"current iterration. I thought this is impossible and I'm very curious on how you accomplished"
                                              f"that. Jokes aside errors can happen and I probably forgot some specific exception. Sorry...")

                        # count how often each context frame occurs per genome; only
                        # clusters not already resolved in a previous round are counted
                        gc = gc_k[int(len(gc_k) / 2)]
                        if gc not in solved:

                            if gc_k not in self.genome_gc_occurence.keys():
                                self.genome_gc_occurence[gc_k] = {genome: 1}

                            else:
                                if genome not in self.genome_gc_occurence[gc_k].keys():
                                    self.genome_gc_occurence[gc_k][genome] = 1
                                else:
                                    self.genome_gc_occurence[gc_k][genome] += 1

            # any frame seen more than once in a single genome is still ambiguous:
            # its central cluster needs a wider context in the next round
            for gc, genome_gc_frequency in self.genome_gc_occurence.items():
                if max(genome_gc_frequency.values()) > 1:

                    unresolved = True
                    drop.add(gc[int(len(gc)/2)])

                elif self.k == 0 and len(genome_gc_frequency) == len(self.genomes):
                    self.single_copy_core.add(gc)

            if self.k == 0:
                self.paralog_dict = copy.deepcopy(self.genome_gc_occurence)

            if unresolved:
                keys = list(self.genome_gc_occurence.keys())
                for gc in keys:
                    if gc[int(len(gc)/2)] in drop:
                        self.genome_gc_occurence.pop(gc)

                solved = set([gc[int(len(gc)/2)] for gc in self.genome_gc_occurence.keys()])
                self.k += 1

        # split the surviving frames of each gene cluster into paralog groups and
        # derive unique ids of the form '<cluster>_<group>'
        g_cleaned = {}
        gcs = set([gc[int(len(gc)/2)] for gc in self.genome_gc_occurence.keys()])

        for gcp in gcs:
            label = [gc for gc in self.genome_gc_occurence.keys() if gc[int(len(gc)/2)] == gcp]
            context = self.context_split(label, gcp)

            for name, cluster in context:
                g_cleaned[name] = gcp + '_' + str(cluster)

        self.run.info_single(f"{pp(self.k+1)} iteration(s) to expand {pp(len(self.paralog_dict.keys()))} GCs to {pp(len(self.genome_gc_occurence.keys()))} GCs without paralogs")
        syn_calls = 0

        # second pass: re-derive each gene call's frame and map it (via the widest
        # matching sub-frame) onto its resolved gene_cluster_id
        for genome in self.gene_synteny_data_dict.keys():

            for contig in self.gene_synteny_data_dict[genome].keys():
                genome_gc_order = [(self.gene_synteny_data_dict[genome][contig][gene_call]["gene_cluster_name"], gene_call) for gene_call in self.gene_synteny_data_dict[genome][contig].keys()]

                left_place = genome_gc_order_left_placeholder[genome][contig]
                right_place = genome_gc_order_right_placeholder[genome][contig]

                for i in range(0, len(genome_gc_order)):
                    start = i-self.k if i-self.k >= 0 else 0
                    stop = i+self.k+1 if i+self.k+1 <= len(genome_gc_order) else len(genome_gc_order)
                    entry = [item[0] for item in genome_gc_order[start:stop]]
                    gene_call = genome_gc_order[i][1]
                    name = genome_gc_order[i][0]

                    if len(entry) == 1 + (2 * self.k):
                        gc_k = tuple(entry)
                    elif start == 0 and stop == len(genome_gc_order):
                        gc_k = tuple([left_place] * (self.k - i) + entry + [right_place] * ((i + self.k + 1) - len(genome_gc_order)))
                    elif start == 0:
                        gc_k = tuple([left_place] * (self.k - i) + entry)
                    elif stop == len(genome_gc_order):
                        gc_k = tuple(entry + [right_place] * ((i + self.k + 1) - len(genome_gc_order)))
                    else:
                        raise ConfigError(f"The search frame of entry {entry} is malformed. It is not tuple of size {1 + (2 * self.k)}"
                                          f"neiter is the frame overlapping with the beginning and the end of a very short contig"
                                          f"nor with either of those. This is weird and should not happen, please check your input data"
                                          f"and make sure there is nothing very weird going on there. Sorry :/")

                    # shrink the frame symmetrically until it matches a resolved frame
                    for j in range(0, self.k+1):

                        gc_group = gc_k[int(len(gc_k)/2)-j:int(len(gc_k)/2)+j+1]

                        if gc_group in self.genome_gc_occurence.keys():

                            self.gene_synteny_data_dict[genome][contig][gene_call]["gene_cluster_id"] = g_cleaned[gc_group]
                            syn_calls += 1
                            break

                        else:
                            pass

        # every counted occurrence must have been assigned an id exactly once
        num_calls = 0
        for _, value in self.genome_gc_occurence.items():
            num_calls += sum(value.values())

        if num_calls != syn_calls:
            raise ConfigError(f"It looks like {abs(num_calls - syn_calls)} calls were not captured by the algorithm. Proceeding from"
                              f"here is a very bad idea. We will try to do better next time. For now please check your data to make"
                              f"sure you did not try some very crazy stuff.")

        self.run.info_single("Done")
    def add_node_to_graph(self, gene_cluster, name, info):
        """Add `gene_cluster` to self.initial_graph, or bump its weight and merge
        the per-genome `info` dict if the node already exists."""

        if not self.initial_graph.has_node(gene_cluster):
            self.initial_graph.add_node(
                gene_cluster,
                name=name,
                pos=(0, 0),
                weight=1,
                layer={},
                genome=info
            )

        else:
            self.initial_graph.nodes[gene_cluster]['weight'] += 1
            self.initial_graph.nodes[gene_cluster]['genome'].update(info)


    def add_edge_to_graph(self, gene_cluster_i, gene_cluster_j, info):
        """Add (or reinforce) the directed edge gene_cluster_i -> gene_cluster_j.

        The weight bonus strongly favors edges seen in the priority genome (+100),
        then edges between (+2) or touching (+1) single-copy core clusters.
        """

        if self.priority_genome in info.keys():
            weight_add = 100
        elif gene_cluster_i in self.single_copy_core and gene_cluster_j in self.single_copy_core:
            weight_add = 2
        elif gene_cluster_i in self.single_copy_core or gene_cluster_j in self.single_copy_core:
            weight_add = 1
        else:
            weight_add = 0

        draw = {genome: {'gene_call': -1} for genome in info.keys()}

        if not self.initial_graph.has_edge(*(gene_cluster_i, gene_cluster_j)):
            self.initial_graph.add_edge(
                *(gene_cluster_i, gene_cluster_j),
                weight=1 + weight_add,
                genome=draw,
                bended=[],
                direction='R'
            )

        else:
            self.initial_graph[gene_cluster_i][gene_cluster_j]['weight'] += 1 + weight_add
            self.initial_graph[gene_cluster_i][gene_cluster_j]['genome'].update(draw)


    def build_graph(self):
        """Build the directed gene cluster graph G from the synteny data, reduce it
        to its largest weakly connected component if necessary, and derive the
        maximum branching (arborescence) M via Edmonds' algorithm."""

        self.run.warning(None, header="Building directed gene cluster graph G", lc="green")

        for genome in self.gene_synteny_data_dict.keys():

            for contig in self.gene_synteny_data_dict[genome].keys():
                gene_cluster_kmer = []
                for gene_call in self.gene_synteny_data_dict[genome][contig].keys():
                    gene_cluster_kmer.append((self.gene_synteny_data_dict[genome][contig][gene_call]['gene_cluster_id'], self.gene_synteny_data_dict[genome][contig][gene_call]['gene_cluster_name'], {genome: {'contig':contig, 'gene_call':gene_call, **self.gene_synteny_data_dict[genome][contig][gene_call]}}))

                # consecutive gene cluster pairs along the contig become edges
                if len(gene_cluster_kmer) > 1:
                    gene_cluster_pairs = map(tuple, zip(gene_cluster_kmer, gene_cluster_kmer[1:]))
                    first_pair = next(gene_cluster_pairs)

                    self.add_node_to_graph(first_pair[0][0], first_pair[0][1], first_pair[0][2])
                    self.add_node_to_graph(first_pair[1][0], first_pair[1][1], first_pair[1][2])
                    self.add_edge_to_graph(first_pair[0][0], first_pair[1][0], first_pair[1][2])

                    for gene_cluster_pair in gene_cluster_pairs:

                        self.add_node_to_graph(gene_cluster_pair[1][0], gene_cluster_pair[1][1], gene_cluster_pair[1][2])
                        self.add_edge_to_graph(gene_cluster_pair[0][0], gene_cluster_pair[1][0], gene_cluster_pair[1][2])

                else:
                    # NOTE(review): a contig with zero gene calls would raise an
                    # IndexError here -- presumably contigs are never empty; confirm.
                    self.add_node_to_graph(gene_cluster_kmer[0][0], gene_cluster_kmer[0][1], gene_cluster_kmer[0][2])

        self.run.info_single(f"Adding {pp(len(self.initial_graph.nodes()))} nodes and {pp(len(self.initial_graph.edges()))} edges to G")

        if self.pass_filter == 0:
            self.run.info_single("Setting algorithm to 'no pass filter'")
        else:
            self.run.info_single(f"Setting algorithm to 'remove edges < {pp(self.pass_filter)}'")

        edges_to_remove = [(i, j) for i,j,k in self.initial_graph.edges(data=True) if k['weight'] < self.pass_filter]

        self.run.info_single(f"Removed {pp(len(edges_to_remove))} edges due to pass filter.")

        self.initial_graph.remove_edges_from(edges_to_remove)

        connectivity = nx.is_connected(self.initial_graph.to_undirected())
        self.run.info_single(f"Connectivity is {connectivity}")

        if connectivity == False:

            # keep only the largest weakly connected component
            weakly_components = list(nx.weakly_connected_components(self.initial_graph))
            self.run.info_single(f"Graph contains {pp(len(weakly_components))} independant components.")

            self.pangenome_graph = nx.DiGraph(self.initial_graph.subgraph(max(weakly_components, key=len)))
            self.run.info_single(f"Keeping longest subgraph with {pp(len(self.pangenome_graph.nodes()))} nodes and {pp(len(self.pangenome_graph.edges()))} edges")

            connectivity = nx.is_connected(self.pangenome_graph.to_undirected())
            self.run.info_single(f"Connectivity is {connectivity}")
        else:
            self.pangenome_graph = nx.DiGraph(self.initial_graph)

        self.run.info_single("Done")
        self.run.warning(None, header="Building maximum branching graph M of G", lc="green")

        # self-loops would break the arborescence construction
        selfloops = list(nx.selfloop_edges(self.pangenome_graph))
        self.run.info_single(f"Found and removed {pp(len(selfloops))} selfloop edge(s)")
        self.pangenome_graph.remove_edges_from(selfloops)

        edmonds = nx.algorithms.tree.branchings.Edmonds(self.pangenome_graph)
        self.edmonds_graph = edmonds.find_optimum(
            attr="weight",
            default=1,
            kind="max",
            style="arborescence",
            preserve_attrs=False,
            partition=None,
        )

        if not nx.algorithms.tree.recognition.is_arborescence(self.edmonds_graph):
            # fallback: retry on the largest weakly connected component of the branching
            self.run.info_single('No maximum aborescence. Entering fallback mode.')
            edmonds_sub_graph = max(nx.weakly_connected_components(self.edmonds_graph), key=len)
            self.edmonds_graph = nx.DiGraph(self.edmonds_graph.subgraph(edmonds_sub_graph))
            self.pangenome_graph = nx.DiGraph(self.pangenome_graph.subgraph(edmonds_sub_graph))
            if nx.algorithms.tree.recognition.is_arborescence(self.edmonds_graph):
                self.run.info_single('Found aborescence.')

                #TODO calculate number of missing edges
                # NOTE(review): self.edmonds_graph was already reduced to
                # edmonds_sub_graph above, so this difference is always 0 -- the
                # pre-fallback node count was probably intended here.
                self.run.info_single(f'{len(self.edmonds_graph.nodes())-len(edmonds_sub_graph)} nodes removed to capture synteny.')
                self.run.info_single('Be warned the quality of the pangraph might have suffered.')
                self.run.info_single('Proceed.')
            else:
                raise ConfigError(f"I'm very sorry to inform you that your data is not solvable by the current version of"
                                  f"anvi'o pangraph. The fallback mode tried to solve your dataset by sacrificing some"
                                  f"of the included information, but at this scale it will not lead to a acceptable result :(")

        else:
            self.run.info_single('Found aborescence. Proceed.')

        # copy the full attribute sets from G onto the branching M
        nx.set_edge_attributes(self.edmonds_graph, {(i, j): d for i, j, d in self.pangenome_graph.edges(data=True) if (i, j) in self.edmonds_graph.edges()})
        nx.set_node_attributes(self.edmonds_graph, {k: d for k, d in self.pangenome_graph.nodes(data=True) if k in self.edmonds_graph.nodes()})

        # connect every root (node without predecessors) to a synthetic 'start' node
        add_start = []
        for node in self.edmonds_graph.nodes():
            if len(list(self.edmonds_graph.predecessors(node))) == 0:
                add_start.append(node)

        for graph in [self.pangenome_graph, self.edmonds_graph]:
            graph.add_node(
                'start',
                name='start',
                pos=(0, 0),
                weight=len(self.genomes),
                layer={},
                genome={genome: {'gene_call': -1, 'contig': '', 'direction': ''} for genome in self.genomes}
            )

            for u in add_start:

                weight = graph.nodes[u]['weight']
                genomes = graph.nodes[u]['genome'].keys()

                graph.add_edge(
                    *('start', u),
                    genome={genome: {'gene_call': -1} for genome in genomes},
                    weight=weight,
                    bended=[],
                    direction='R'
                )

        self.run.info_single(f"Removing {pp(len(self.pangenome_graph.edges()) - len(self.edmonds_graph.edges()))} edges from G to create M")
        self.run.info_single("Done")
Proceed.') + + nx.set_edge_attributes(self.edmonds_graph, {(i, j): d for i, j, d in self.pangenome_graph.edges(data=True) if (i, j) in self.edmonds_graph.edges()}) + nx.set_node_attributes(self.edmonds_graph, {k: d for k, d in self.pangenome_graph.nodes(data=True) if k in self.edmonds_graph.nodes()}) + + add_start = [] + for node in self.edmonds_graph.nodes(): + if len(list(self.edmonds_graph.predecessors(node))) == 0: + add_start.append(node) + + for graph in [self.pangenome_graph, self.edmonds_graph]: + graph.add_node( + 'start', + name='start', + pos=(0, 0), + weight=len(self.genomes), + layer={}, + genome={genome: {'gene_call': -1, 'contig': '', 'direction': ''} for genome in self.genomes} + ) + + for u in add_start: + + weight = graph.nodes[u]['weight'] + genomes = graph.nodes[u]['genome'].keys() + + graph.add_edge( + *('start', u), + genome={genome: {'gene_call': -1} for genome in genomes}, + weight=weight, + bended=[], + direction='R' + ) + + self.run.info_single(f"Removing {pp(len(self.pangenome_graph.edges()) - len(self.edmonds_graph.edges()))} edges from G to create M") + self.run.info_single("Done") + + def find_next_branching_point(self, current): + if current != 'start': + pred = list(self.edmonds_graph.predecessors(current))[0] + pred_successors = list(self.edmonds_graph.successors(pred)) + if len(pred_successors) > 1: + return(pred) + else: + self.find_next_branching_point(pred) + + + def mean_edmonds_graph_path_weight(self, source, target): + path = nx.shortest_path(G=self.edmonds_graph, source=source, target=target, weight='weight') + path_weight = nx.path_weight(G=self.edmonds_graph, path=path, weight='weight') + return(path_weight) + + + def get_edge(self, node_i, node_j, reverse = False): + + if reverse == False: + pangenome_graph_edge_data = self.pangenome_graph.get_edge_data(node_i, node_j) + return(node_i, node_j, pangenome_graph_edge_data) + else: + pangenome_graph_edge_data = {y:z if y != 'direction' else 'L' for y,z in 
    def get_leaves(self, current_branch_successor, edmonds_graph_successors):
        """Follow the arborescence downwards from `current_branch_successor` and
        return the set of reachable leaves (nodes without successors), recursing
        at every branching point."""
        leaves = set()
        while True:
            successors = edmonds_graph_successors[current_branch_successor]

            if successors:
                if len(successors) > 1:
                    for successor in successors:
                        leaves.update(self.get_leaves(successor, edmonds_graph_successors))
                    break
                else:
                    current_branch_successor = successors[0]
            else:
                leaves.update(set([current_branch_successor]))
                break

        return(leaves)


    def edge_check(self, node_i, node_j, data):
        """Return a copy of `data` merged with any existing (node_i, node_j) edge in
        self.edmonds_graph: genomes are united, weights added, and the direction
        marked 'B' (both)."""

        new_data = copy.deepcopy(data)

        if self.edmonds_graph.has_edge(node_i, node_j):
            old_data = self.edmonds_graph[node_i][node_j]
            new_data['genome'].update(old_data['genome'])
            new_data['weight'] += old_data['weight']
            new_data['direction'] = 'B'

        return(new_data)


    def run_tree_to_flow_network_algorithm(self):
        """Reattach the edges of G that Edmonds' algorithm removed, turning the
        arborescence M into a left-to-right flow network F from 'start' to a
        synthetic 'stop' node, while keeping the graph acyclic (backward edges are
        re-inserted reversed with direction 'L')."""

        self.run.warning(None, header="Building flow network F from M and G", lc="green")

        edmonds_graph_edges = set(self.edmonds_graph.edges())
        edmonds_graph_nodes = set(self.edmonds_graph.nodes())

        pangenome_graph_edges = set(self.pangenome_graph.edges())
        pangenome_graph_nodes = set(self.pangenome_graph.nodes())

        # the edges present in G but dropped from the branching M
        edmonds_graph_removed_edges = pangenome_graph_edges - edmonds_graph_edges
        # the leaf farthest from 'start' (by path weight) becomes the main endpoint
        edmonds_graph_end = max([(self.mean_edmonds_graph_path_weight('start', node), node) for node in edmonds_graph_nodes if len(list(self.edmonds_graph.successors(node))) == 0])[1]

        for graph in [self.pangenome_graph, self.edmonds_graph]:

            graph.add_node(
                'stop',
                name='stop',
                pos=(0, 0),
                weight=len(self.genomes),
                layer={},
                genome={genome: {'gene_call': -1, 'contig': '', 'direction': ''} for genome in self.genomes}
            )
            graph.add_edge(
                *(edmonds_graph_end, 'stop'),
                genome={genome: {'gene_call': -1} for genome in self.genomes},
                weight=len(self.genomes),
                bended=[],
                direction='R'
            )

        edmonds_graph_nodes.add('stop')
        edmonds_graph_predecessors = {edmonds_graph_node: list(self.edmonds_graph.predecessors(edmonds_graph_node))[0] for edmonds_graph_node in edmonds_graph_nodes if edmonds_graph_node != 'start'}
        edmonds_graph_successors = {edmonds_graph_node: list(self.edmonds_graph.successors(edmonds_graph_node)) for edmonds_graph_node in edmonds_graph_nodes}
        pangenome_graph_successors = {pangenome_graph_node: list(self.pangenome_graph.successors(pangenome_graph_node)) for pangenome_graph_node in pangenome_graph_nodes}
        edmonds_graph_distances = {edmonds_graph_node: self.mean_edmonds_graph_path_weight('start', edmonds_graph_node) for edmonds_graph_node in edmonds_graph_nodes}

        i = 0
        resolved_nodes = set(['stop'])

        pred = ''
        x = 0
        self.progress.new("Solving complex graph")
        # outer loop: repeat full traversals until every node is resolved, plus one
        # extra pass (x == 1) to pick up remaining removed edges
        while len(resolved_nodes) != len(pangenome_graph_nodes) + 1 or x < 1:
            if len(resolved_nodes) == len(pangenome_graph_nodes) + 1:
                x += 1

            i += 1

            visited_nodes = set(['stop'])
            current_node = 'stop'
            # inner loop: walk the tree from 'stop' upwards, always descending into
            # the unvisited branch whose leaf is farthest from 'start'
            while len(visited_nodes) != len(pangenome_graph_nodes) + 1:

                self.progress.update(f"{str(len(resolved_nodes)).rjust(len(str(len(pangenome_graph_nodes) + 1)), ' ')} / {len(pangenome_graph_nodes) + 1}")

                if pred:
                    current_branch_root = pred
                    pred = ''
                else:
                    current_branch_root = edmonds_graph_predecessors[current_node]

                current_forward_connected = []
                current_backward_connected = []
                successor_branch_leaves = set()

                for current_branch_successor in edmonds_graph_successors[current_branch_root]:
                    if current_branch_successor not in visited_nodes and current_branch_successor != current_node:
                        successor_branch_leaves.update(self.get_leaves(current_branch_successor, edmonds_graph_successors))

                if not successor_branch_leaves:
                    current_node = current_branch_root
                else:
                    current_node = max([(edmonds_graph_distances[successor_branch_leaf], successor_branch_leaf) for successor_branch_leaf in successor_branch_leaves])[1]

                if current_node in resolved_nodes:
                    connected = True
                else:
                    connected = False

                # sort this node's removed G-edges into forward (target already
                # resolved, not an ancestor) and backward (target is an ancestor)
                if connected != True or x == 1:
                    for current_node_successor in pangenome_graph_successors[current_node]:

                        if current_node_successor in resolved_nodes:

                            if current_node_successor in nx.ancestors(self.edmonds_graph, current_node) or (current_node_successor not in visited_nodes and current_node_successor not in resolved_nodes):
                                if (current_node, current_node_successor) in edmonds_graph_removed_edges:
                                    current_backward_connected.append(current_node_successor)

                            else:
                                if (current_node, current_node_successor) in edmonds_graph_removed_edges:
                                    current_forward_connected.append(current_node_successor)
                                    connected = True

                        else:
                            connected = True

                # dead ends in G get wired straight to 'stop'
                if connected == False:
                    if len(list(self.pangenome_graph.successors(current_node))) == 0:
                        pangenome_graph_edge_data = {
                            'genome':{genome: {'gene_call': -1} for genome in self.genomes},
                            'weight':len(self.genomes),
                            'bended': [],
                            'direction': 'R'
                        }

                        new_data = self.edge_check(current_node, 'stop', pangenome_graph_edge_data)
                        self.edmonds_graph.add_edge(current_node, 'stop', **new_data)
                        connected = True

                if connected == True:
                    # reattach forward edges as-is and backward edges reversed ('L')
                    for current_forward in current_forward_connected:
                        node_i, node_j, data = self.get_edge(current_node, current_forward, reverse = False)

                        new_data = self.edge_check(node_i, node_j, data)
                        self.edmonds_graph.add_edge(node_i, node_j, **new_data)
                        edmonds_graph_removed_edges.remove((current_node, current_forward))

                    for current_backward in current_backward_connected:
                        node_i, node_j, data = self.get_edge(current_node, current_backward, reverse = True)

                        new_data = self.edge_check(node_i, node_j, data)
                        self.edmonds_graph.add_edge(node_i, node_j, **new_data)
                        edmonds_graph_removed_edges.remove((current_node, current_backward))

                    resolved_nodes.add(current_node)

                else:
                    # unresolved node with only backward candidates: rehang it under
                    # the heaviest backward target and update all bookkeeping maps
                    if current_backward_connected:

                        number = max([(self.pangenome_graph.get_edge_data(current_node, backward)['weight'], i) for (i, backward) in enumerate(current_backward_connected)])[1]

                        node_i, node_j, data = self.get_edge(current_node, current_backward_connected[number], reverse = True)
                        self.edmonds_graph.remove_edge(edmonds_graph_predecessors[current_node], current_node)

                        new_data = self.edge_check(node_i, node_j, data)
                        self.edmonds_graph.add_edge(node_i, node_j, **new_data)

                        edmonds_graph_removed_edges.remove((current_node, current_backward_connected[number]))
                        edmonds_graph_removed_edges.add((edmonds_graph_predecessors[current_node], current_node))

                        edmonds_graph_successors[edmonds_graph_predecessors[current_node]].remove(current_node)
                        edmonds_graph_successors[current_backward_connected[number]] += [current_node]

                        pred = edmonds_graph_predecessors[current_node]

                        edmonds_graph_predecessors.pop(current_node, None)
                        edmonds_graph_predecessors[current_node] = current_backward_connected[number]

                        edmonds_graph_distances[current_node] = self.mean_edmonds_graph_path_weight('start', current_node)

                        resolved_nodes.add(current_node)

                visited_nodes.add(current_node)

            if not nx.is_directed_acyclic_graph(self.edmonds_graph):
                raise ConfigError(f"Oh no. It looks like your graph is so complex or includes a motif I haven't seen before"
                                  f"therefore the reattachement algorithm itself included a loop to the graph. We had multiple"
                                  f"sanity checks to prevent this but unfortunatly nobody is perfect. We will include more"
                                  f"checks in the next version. Sorry :/")

        self.progress.end()

        # any remaining sink that is not 'stop' still needs an outgoing edge
        remaining_stops = [node for node in self.edmonds_graph.nodes() if self.edmonds_graph.out_degree(node) == 0 and node != 'stop']
        self.run.info_single(f"{pp(i)} iterations to solve the graph")

        for stop in remaining_stops:

            pangenome_graph_edge_data = {
                'genome':{genome: {'gene_call': -1} for genome in self.genomes},
                'weight':len(self.genomes),
                'bended': [],
                'direction': 'R'
            }

            new_data = self.edge_check(stop, 'stop', pangenome_graph_edge_data)
            self.edmonds_graph.add_edge(stop, 'stop', **new_data)

            edmonds_graph_successors[stop] += ['stop']

        if not nx.is_directed_acyclic_graph(self.edmonds_graph):
            raise ConfigError(f"Oh no. It looks like your graph is so complex or includes a motif I haven't seen before"
                              f"therefore the reattachement algorithm itself included a loop to the graph. We had multiple"
                              f"sanity checks to prevent this but unfortunatly nobody is perfect. We will include more"
                              f"checks in the next version. Sorry :/")

        self.run.info_single("Done")
Sorry :/") + + self.progress.end() + + remaining_stops = [node for node in self.edmonds_graph.nodes() if self.edmonds_graph.out_degree(node) == 0 and node != 'stop'] + self.run.info_single(f"{pp(i)} iterations to solve the graph") + + for stop in remaining_stops: + + pangenome_graph_edge_data = { + 'genome':{genome: {'gene_call': -1} for genome in self.genomes}, + 'weight':len(self.genomes), + 'bended': [], + 'direction': 'R' + } + + new_data = self.edge_check(stop, 'stop', pangenome_graph_edge_data) + self.edmonds_graph.add_edge(stop, 'stop', **new_data) + + edmonds_graph_successors[stop] += ['stop'] + + if not nx.is_directed_acyclic_graph(self.edmonds_graph): + raise ConfigError(f"Oh no. It looks like your graph is so complex or includes a motif I haven't seen before" + f"therefore the reattachement algorithm itself included a loop to the graph. We had multiple" + f"sanity checks to prevent this but unfortunatly nobody is perfect. We will include more" + f"checks in the next version. Sorry :/") + + self.run.info_single("Done") + + + def load_graph_from_json_file(self): + + self.run.warning(None, header="Import graph F from json file", lc="green") + + filesnpaths.is_file_json_formatted(self.pan_graph_json) + + self.jsondata = json.load(open(self.pan_graph_json)) + + self.global_x = int(self.jsondata["infos"]["meta"]["global_x"]) + self.priority_genome = self.jsondata["infos"]["priority_genome"] + + self.global_y = 0 + self.ancest = nx.DiGraph() + self.x_list = {} + self.position = {} + self.edges = {} + + for x in range(0, self.global_x+1): + self.x_list[x] = [] + + for node in self.jsondata["elements"]["nodes"]: + name = self.jsondata["elements"]["nodes"][node]["name"] + weight = self.jsondata["elements"]["nodes"][node]["weight"] + genome = self.jsondata["elements"]["nodes"][node]["genome"] + x = self.jsondata["elements"]["nodes"][node]["position"]["x"] + + self.x_list[x].append(node) + self.position[node] = (x, -1) + self.ancest.add_node(node, name=name, 
pos=(0,0), weight=weight, layer={}, genome=genome) + + for edge in self.jsondata["elements"]["edges"]: + source = self.jsondata["elements"]["edges"][edge]["source"] + target = self.jsondata["elements"]["edges"][edge]["target"] + genome = self.jsondata["elements"]["edges"][edge]["genome"] + direction = self.jsondata["elements"]["edges"][edge]["direction"] + + self.ancest.add_edge(source, target, weight=-1, genome=genome, bended=[], direction=direction) + + self.run.info_single("Done") + + + def prepare_synteny_graph(self): + + for x, generation in enumerate(nx.topological_generations(self.edmonds_graph)): + self.x_list[x] = generation + for node in generation: + self.position[node] = (x, -1) + + self.global_x = x + + self.ancest = nx.DiGraph(self.edmonds_graph) + nx.set_edge_attributes(self.ancest, values=-1, name='weight') + + + def run_synteny_layout_algorithm(self): + + self.run.warning(None, header="Calculating coordinates on the nodes from F", lc="green") + + layout_graph_nodes = list(self.ancest.nodes()) + layout_graph_successors = {layout_graph_node: list(self.ancest.successors(layout_graph_node)) for layout_graph_node in layout_graph_nodes} + + n_removed = 0 + ghost = 0 + for x in range(self.global_x-1, 0, -1): + for node in self.x_list[x]: + node_x_position = self.position[node][0] + + change = [] + for successor in layout_graph_successors[node]: + if successor != 'stop': + successor_x_position = self.position[successor][0] + + if successor_x_position <= node_x_position: + raise ConfigError(f"The node {node} is succeded by the node {successor} which is in front of the node {node}" + f"that does not make a lot of sense and should not happen. 
We are sorry for the inconvenience :/") + else: + change.append((successor_x_position, successor)) + + if change: + if min(change)[0] > 1: + new_x_position = min([(x, n) for (x, n) in change if x > 1])[0] - 1 + self.position[node] = (new_x_position, -1) + + for (position, extend_successor) in change: + x_difference = position - new_x_position + + if x_difference <= self.max_edge_length_filter or self.max_edge_length_filter == -1: + + path_list = [node] + + for i in range(1, x_difference): + path_list += ['Ghost_' + str(ghost)] + self.position['Ghost_' + str(ghost)] = (new_x_position + i, -1) + ghost += 1 + + path_list += [extend_successor] + + self.edges[(node, extend_successor)] = (path_list, self.ancest.edges[node, extend_successor]) + self.ancest.remove_edge(node, extend_successor) + + if len(path_list) == 2: + self.ancest.add_edges_from(map(tuple, zip(path_list, path_list[1:])), weight=-1) + else: + self.ancest.add_edges_from(map(tuple, zip(path_list, path_list[1:])), weight=-0.5) + + else: + n_removed += 1 + self.removed.add((node, extend_successor)) + + for i, j in self.ancest.edges(): + + if self.position[j][0] - self.position[i][0] != 1 and i != 'start' and j != 'stop' and (i,j) not in self.removed: + raise ConfigError(f"Hmmm. This situation would create a very weird looking connection." + f"The ede {(i, j)} is longer than it should be. I don't know what created" + f"this but we will work on a solution on the next release. 
Sorry :(") + + if self.max_edge_length_filter == -1: + self.run.info_single("Setting algorithm to 'keep all edges'") + else: + self.run.info_single(f"Setting algorithm to 'max edge length < {pp(self.max_edge_length_filter)}'") + + self.run.info_single(f"Removed {pp(n_removed)} edge(s) due to length cutoff") + + longest_path = nx.bellman_ford_path(G=self.ancest, source='start', target='stop', weight='weight') + m = set(longest_path) + + starts = [node for node in self.ancest.nodes() if self.ancest.in_degree(node) == 0 if node != 'start'] + for st in starts: + self.ancest.add_edge('start', st, weight=-1) + dfs_list = list(nx.dfs_edges(self.ancest, source='start')) + + group = 0 + groups = {} + groups_rev = {} + + # TODO Currently no seperation between unequal genome context + if self.gene_cluster_grouping_threshold == -1: + self.run.info_single("Setting algorithm to 'no grouping'") + else: + self.run.info_single(f"Setting algorithm to 'Grouping single connected chains size > {pp(self.gene_cluster_grouping_threshold)}'") + + for node_v, node_w in dfs_list: + if node_v != 'start' and self.ancest.in_degree(node_v) == 1 and self.ancest.out_degree(node_v) == 1 and self.ancest.in_degree(node_w) == 1 and self.ancest.out_degree(node_w) == 1: + + if self.ungroup[0] != -1 and self.ungroup[1] != -1 and self.position[node_v][0] > self.ungroup[0] and self.position[node_v][0] < self.ungroup[1] and self.position[node_w][0] > self.ungroup[0] and self.position[node_w][0] < self.ungroup[1]: + continue + else: + if node_v not in groups_rev.keys(): + group_name = 'GCG_' + str(group).zfill(8) + groups[group_name] = [node_v, node_w] + groups_rev[node_v] = group_name + groups_rev[node_w] = group_name + group += 1 + + else: + group_name = groups_rev[node_v] + groups[group_name] += [node_w] + groups_rev[node_w] = group_name + + for label, condense_nodes in groups.items(): + + condense_nodes = [node for node in condense_nodes if not node.startswith('Ghost_')] + + if len(condense_nodes) >= 
self.gene_cluster_grouping_threshold and self.gene_cluster_grouping_threshold != -1: + self.grouping[label] = condense_nodes + + self.run.info_single(f"Grouped {pp(len(sum(self.grouping.values(), [])))} nodes in {pp(len(self.grouping.keys()))} groups") + + for st in starts: + self.ancest.remove_edge('start', st) + + self.ancest.remove_edges_from(self.removed) + + branches = {} + sortable = [] + for g in groups.keys(): + branch = groups[g] + + if not set(branch).isdisjoint(m) and not set(branch).issubset(m): + raise ConfigError(f"A group is neither disjoint from the main path nor subset of the main path" + f"we should not continue from here as this is not something that should happen.") + # print('Sanity Error. Code 10.') + # break + + elif set(branch).isdisjoint(m): + + start = self.position[branch[0]][0] + length = len(branch) + + if start in branches.keys(): + if length in branches[start].keys(): + if branches[start][length].keys(): + num = max(branches[start][length].keys()) + 1 + else: + num = 1 + + branches[start][length][num] = branch + else: + num = 1 + branches[start][length] = {num: branch} + else: + num = 1 + branches[start] = {length: {num: branch}} + + sortable += [(start, length, num)] + + left_nodes = set(self.ancest.nodes()) - set(groups_rev.keys()) + for n in left_nodes: + + if not set([n]).isdisjoint(m) and not set([n]).issubset(m): + raise ConfigError(f"A group is neither disjoint from the main path nor subset of the main path" + f"we should not continue from here as this is not something that should happen.") + # print('Sanity Error. 
Code 11.') + # break + + elif set([n]).isdisjoint(m): + start = self.position[n][0] + length = 1 + if n != 'start' and n != 'stop': + if start in branches.keys(): + if length in branches[start].keys(): + if branches[start][length].keys(): + num = max(branches[start][length].keys()) + 1 + else: + num = 1 + + branches[start][length][num] = [n] + else: + num = 1 + branches[start][length] = {num: [n]} + else: + num = 1 + branches[start] = {length: {num: [n]}} + + sortable += [(start, length, num)] + + used = set() + finished = set() + + y_new = 0 + for node in longest_path: + x_pos = self.position[node][0] + self.position[node] = (x_pos, y_new) + used.add((x_pos, y_new)) + finished.add(node) + + stack = [longest_path] + while stack: + + current = stack[0] + + remove = True + for i,j,k in sorted(sortable, key=lambda x: (x[1], x[0]), reverse = False): + branch = branches[i][j][k] + branch_pred = set(self.ancest.predecessors(branch[0])) + branch_succ = set(self.ancest.successors(branch[-1])) + if (not branch_pred.isdisjoint(set(current))) or (not branch_succ.isdisjoint(set(current))) or (not branch_pred.isdisjoint(set(current)) and not branch_succ.isdisjoint(set(current))): + + remove = False + sortable.remove((i,j,k)) + y_new = max(sum([[self.position[ypred][1] for ypred in branch_pred], [self.position[ysucc][1] for ysucc in branch_succ]], [])) + + stack = [branch] + stack + while True: + repeat = False + for xnew in range(i, i+j): + if (xnew, y_new) in used: + repeat = True + + if repeat == False: + break + else: + y_new += 1 + + for node in branch: + x_pos = self.position[node][0] + self.position[node] = (x_pos, y_new) + used.add((x_pos, y_new)) + finished.add(node) + + self.global_y = y_new if y_new > self.global_y else self.global_y + + break + + if remove == True: + stack.remove(current) + + if len(set(self.position.values())) != len(self.position.values()): + # print(len(self.position.values()) - len(set(self.position.values()))) + raise ConfigError(f"No no no no. 
Something went very wrong here. Some nodes overlap in the UI." + f"We don't want this, we definitely don't want this...") + + for edge_i, edge_j in self.edges.keys(): + path_list, infos = self.edges[(edge_i, edge_j)] + + self.ancest.add_edge(edge_i, edge_j, **infos) + self.ancest[edge_i][edge_j]['weight'] = sum([1 if value != self.priority_genome else 10 for value in self.ancest[edge_i][edge_j]['genome'].keys()]) + self.ancest[edge_i][edge_j]['bended'] = [self.position[p] for p in path_list[1:-1]] + self.ancest.remove_nodes_from(path_list[1:-1]) + + for node in self.ancest.nodes(): + self.ancest.nodes[node]['pos'] = self.position[node] + + self.offset[node] = self.position[node][0] + + x_positions_list = [] + + for group_name in self.grouping.keys(): + + group = self.grouping[group_name] + + group_size = len(group) + group_size_compressed = round(group_size * self.groupcompress) + compressed_factor = group_size_compressed if group_size_compressed == 0 else group_size_compressed - 1 + + node_distance_factor = compressed_factor / (group_size - 1) + + start_x = self.ancest.nodes[group[0]]['pos'][0] + + for i, node in enumerate(group): + self.offset[node] = round(start_x + i * node_distance_factor) + + compressed_length = int(self.offset[group[-1]] - self.offset[group[0]]) + + x_positions_list += [i for i in range(start_x, start_x + compressed_length + 1)] + + x_positions_list += list(self.offset.values()) + x_positions = set(x_positions_list) + empty_spots = [] + + for x in range(self.global_x+1): + if x not in x_positions: + empty_spots.append(x) + + for node in self.ancest.nodes(): + decrease = 0 + x = self.offset[node] + for e in empty_spots: + if x > e: + decrease += 1 + + if e == empty_spots[-1]: + self.offset[node] = x - decrease + break + + else: + self.offset[node] = x - decrease + break + + for edge_i, edge_j, data in self.ancest.edges(data=True): + if edge_i != 'start' and edge_j != 'stop': + if data['bended']: + for i, (x, y) in enumerate(data['bended']): + 
decrease = 0 + for e in empty_spots: + if x > e-1: + decrease += 1 + else: + data['bended'][i] = (x - decrease, y) + break + else: + data['bended'][i] = (x - decrease, y) + + else: + if self.offset[edge_j] - self.offset[edge_i] != 1: + + y = self.position[edge_i][1] + x = self.offset[edge_i] + 1 + + while x < self.offset[edge_j]: + data['bended'].append((x, y)) + x += 1 + + self.global_x_offset = self.offset['stop'] + + self.run.info_single(f"Final graph {pp(len(self.ancest.nodes()))} nodes and {pp(len(self.ancest.edges()))} edges") + + self.run.info_single("Done") + + + def generate_data_table(self): + + self.run.warning(None, header="Import layer values inherited from the pangenome", lc="green") + + max_paralogs = 0 + + for gene_cluster in self.ancest.nodes(): + + if gene_cluster != 'start' and gene_cluster != 'stop': + node = self.ancest.nodes()[gene_cluster] + num = len(node['genome'].keys()) + max_paralog_list = [] + num_dir_r = 0 + num_dir_l = 0 + + func_homogeneity_list = [] + geo_homogeneity_list = [] + comb_homogeneity_list = [] + + for genome in node['genome'].keys(): + info = node['genome'][genome] + + max_paralog_list.append(info['max_num_paralogs']) + func_homogeneity_list.append(info['functional_homogeneity_index']) + geo_homogeneity_list.append(info['geometric_homogeneity_index']) + comb_homogeneity_list.append(info['combined_homogeneity_index']) + + if info['direction'] == 'r': + num_dir_r += 1 + else: + num_dir_l += 1 + + max_paralogs = max(max_paralog_list) if max(max_paralog_list) > max_paralogs else max_paralogs + + node['layer'] = { + 'Paralogs': max(max_paralog_list) - 1, + 'Direction': 1 - max(num_dir_r, num_dir_l)/num, + 'Functional_Homogeneity': 1 - max(func_homogeneity_list), + 'Geometric_Homogeneity': 1 - max(geo_homogeneity_list), + 'Combined_Homogeneity': 1 - max(comb_homogeneity_list) + } + + self.data_table_dict['Functional_Homogeneity'] = { + 'max': 1, + 'min': 0 + } + self.data_table_dict['Geometric_Homogeneity'] = { + 'max': 1, + 
'min': 0 + } + self.data_table_dict['Combined_Homogeneity'] = { + 'max': 1, + 'min': 0 + } + self.data_table_dict['Paralogs'] = { + 'max': max_paralogs - 1 if max_paralogs - 1 != 0 else 1, + 'min': 0 + } + self.data_table_dict['Direction'] = { + 'max': 0.5, + 'min': 0 + } + + self.run.info_single("Done") + + + def get_additional_gene_layer_table(self): + + self.run.warning(None, header="Export empty dataframe for genecall annotations", lc="green") + + gene_layer_dict = {} + + i = 0 + for node, data in self.ancest.nodes(data=True): + if node != 'start' and node != 'stop': + for genome in data['genome'].keys(): + contig = data['genome'][genome]['contig'] + genecall = data['genome'][genome]['gene_call'] + + gene_layer_dict[i] = {'genome': genome, 'contig': contig, 'genecall': genecall, 'value': 0} + i += 1 + + gene_layer_df = pd.DataFrame.from_dict(gene_layer_dict, orient='index') + + gene_layer_df.to_csv(self.output_raw_gene_additional_data, index=False) + + self.run.info_single("Done") + + + def get_additional_gc_layer_table(self): + + self.run.warning(None, header="Export empty dataframe for gc annotations", lc="green") + + gc_layer_dict = {} + included = set() + + i = 0 + for node, data in self.ancest.nodes(data=True): + if node != 'start' and node != 'stop': + name = data['name'] + if name not in included: + gc_layer_dict[i] = {'genecluster': name, 'value': 0} + included.add(name) + i += 1 + + gc_layer_df = pd.DataFrame.from_dict(gc_layer_dict, orient='index') + + gc_layer_df.to_csv(self.output_raw_gc_additional_data, index=False) + + self.run.info_single("Done") + + + def add_additional_gene_layer_values(self): + + self.run.warning(None, header="Appending layer values from external gene data", lc="green") + + df = pd.read_csv(self.gene_additional_data) + df.set_index(['genome', 'contig', 'genecall'], inplace=True) + layer_names = list(df.columns) + layer_max = {layer_name: 0 for layer_name in layer_names} + + for node, data in self.ancest.nodes(data=True): + + 
if node != 'start' and node != 'stop': + + for layer_name in layer_names: + value_list = [] + + for genome in data['genome'].keys(): + + contig = data['genome'][genome]['contig'] + genecall = data['genome'][genome]['gene_call'] + + value = df.loc[(genome, contig, genecall)][layer_name].item() + value_list.append(value) + + value_sum = sum(value_list) / len(value_list) + + data['layer'][layer_name] = value_sum + layer_max[layer_name] = layer_max[layer_name] if layer_max[layer_name] > value_sum else value_sum + + for layer_name in layer_names: + + self.data_table_dict[layer_name] = { + 'max': layer_max[layer_name], + 'min': 0 + } + + self.run.info_single("Done") + + + def add_additional_gc_layer_values(self): + + self.run.warning(None, header="Appending layer values from external gc data", lc="green") + + df = pd.read_csv(self.gc_additional_data) + df.set_index(['genecluster'], inplace=True) + layer_names = list(df.columns) + layer_max = {layer_name: 0 for layer_name in layer_names} + + for node, data in self.ancest.nodes(data=True): + if node != 'start' and node != 'stop': + name = data['name'] + for layer_name in layer_names: + + value = df.loc[name][layer_name].item() + data['layer'][layer_name] = value + layer_max[layer_name] = layer_max[layer_name] if layer_max[layer_name] > value else value + + for layer_name in layer_names: + + self.data_table_dict[layer_name] = { + 'max': layer_max[layer_name], + 'min': 0 + } + + self.run.info_single("Done") + + + def walk_one_step(self, G, current, nodes_position_dict, visited): + successors = [successor for successor in G.successors(current) if successor not in visited] + predecessors = [predecessor for predecessor in G.predecessors(current) if predecessor not in visited] + + current_position_x, current_position_y = nodes_position_dict[current] + + if len(successors) > 0: + + nodes_position = [(nodes_position_dict[successor][0] - current_position_x, successor) for successor in successors] + successor_position, successor = 
sorted(nodes_position)[0] + + successor_position_x, successor_position_y = nodes_position_dict[successor] + return(successor_position_x, successor_position_y, successor) + + elif len(predecessors) > 0: + nodes_position = [(current_position_x - nodes_position_dict[predecessor][0], predecessor) for predecessor in predecessors] + predecessor_position, predecessor = sorted(nodes_position, reverse=True)[0] + + predecessor_position_x, predecessor_position_y = nodes_position_dict[predecessor] + return(predecessor_position_x, predecessor_position_y, predecessor) + + else: + return(-1, -1, '') + + + def is_node_core(self, node): + if set(self.ancest.nodes()[node]['genome'].keys()) == set(self.genomes_names): + return(True) + else: + return(False) + + + def get_genomic_motifs(self): + + self.run.warning(None, header="Generate pangenome graph summary tables", lc="green") + + graph_min = self.ancest.nodes()['start']['pos'][0] + 1 + graph_max = self.ancest.nodes()['stop']['pos'][0] - 1 + + region_id = 0 + regions = {} + region_sides = {} + + for genome in self.genomes_names: + # print(genome) + + G_subset_nodes = [n for n,d in self.ancest.nodes(data='genome') if genome not in d.keys()] + G_subset_edges = [(i,j) for i,j,d in self.ancest.edges(data='genome') if genome not in d.keys()] + G3 = self.ancest.copy() + G3.remove_nodes_from(['start', 'stop'] + G_subset_nodes) + G3.remove_edges_from(G_subset_edges) + + region_type = {} + regions_reverse = [] + nodes_position_dict = dict(G3.nodes(data="pos")) + + starting_points = [] + for node in list(G3.nodes()): + if len(list(G3.successors(node))) <= 1 and len(list(G3.predecessors(node))) == 0: + starting_points += [node] + + starting_points = sorted(starting_points, key=lambda x: nodes_position_dict[x][0]) + + # Prepare first starting point in data structure + node = starting_points[0] + starting_points.remove(node) + + node_position_x, node_position_y = nodes_position_dict[node] + former_position = node_position_x + + region_id += 1 + 
+ # gene cluster core? + if self.is_node_core(node): + # start new core region + regions[region_id] = [(node_position_x, node_position_y, node)] + is_core = True + else: + # start new accessory region + regions[region_id] = [(node_position_x, node_position_y, node)] + is_core = False + + visited = set([node]) + mode = '' + + while True: + node_position_x, node_position_y, node = self.walk_one_step(G3, node, nodes_position_dict, visited) + + if node: + mode = '' + else: + if starting_points: + node = starting_points[0] + starting_points.remove(node) + node_position_x, node_position_y = nodes_position_dict[node] + direction = '' + mode = 'break' + else: + break + + # extend current region? + if node_position_x == former_position + 1 or node_position_x == (former_position - graph_max) + 1: + # extend core region? + if is_core: + # gene cluster core? + if self.is_node_core(node): + # extend core region with core gene cluster + regions[region_id] += [(node_position_x, node_position_y, node)] + is_core = True + else: + # end core region + region_type[region_id] = ('core', regions[region_id][0][0], regions[region_id][-1][0]) + region_id += 1 + + # start accessory region + regions[region_id] = [(node_position_x, node_position_y, node)] + is_core = False + else: + # gene cluster core? 
+ if self.is_node_core(node): + # end accessory region + region_type[region_id] = ('accessory', regions[region_id][0][0], regions[region_id][-1][0]) + region_id += 1 + + # start core region + regions[region_id] = [(node_position_x, node_position_y, node)] + is_core = True + else: + # extend accessory region + regions[region_id] += [(node_position_x, node_position_y, node)] + is_core = False + + elif node_position_x == former_position: + if is_core: + region_type[region_id] = ('core', regions[region_id][0][0], regions[region_id][-1][0]) + else: + region_type[region_id] = ('accessory', regions[region_id][0][0], regions[region_id][-1][0]) + + region_id += 1 + + if self.is_node_core(node): + # start new core region + regions[region_id] = [(node_position_x, node_position_y, node)] + is_core = True + else: + # start new accessory region + regions[region_id] = [(node_position_x, node_position_y, node)] + is_core = False + + # new forward bridge region? + elif (node_position_x > former_position + 1 and abs(node_position_x - former_position) <= graph_max / 2) or (node_position_x < former_position + 1 and abs(former_position - node_position_x) > graph_max / 2): + # is the region to end core? + if is_core: + region_type[region_id] = ('core', regions[region_id][0][0], regions[region_id][-1][0]) + else: + region_type[region_id] = ('accessory', regions[region_id][0][0], regions[region_id][-1][0]) + + region_id += 1 + + if node_position_x > former_position + 1: + regions[region_id] = [(i, '', '') for i in range(former_position + 1, node_position_x, 1)] + else: + regions[region_id] = [(i, '', '') for i in range(former_position + 1, graph_max + 1, 1)] + [(i, '', '') for i in range(graph_min, node_position_x, 1)] + + if mode == 'break': + region_type[region_id] = ('break', regions[region_id][0][0], regions[region_id][-1][0]) + else: + region_type[region_id] = ('forward', regions[region_id][0][0], regions[region_id][-1][0]) + + region_id += 1 + + # gene cluster core? 
+                    if self.is_node_core(node):
+                        # start new core region
+                        regions[region_id] = [(node_position_x, node_position_y, node)]
+                        is_core = True
+                    else:
+                        # start new accessory region
+                        regions[region_id] = [(node_position_x, node_position_y, node)]
+                        is_core = False
+
+                # new backward bridge region?
+                elif (node_position_x < former_position + 1 and abs(former_position - node_position_x) <= graph_max / 2) or (node_position_x > former_position + 1 and abs(node_position_x - former_position) > graph_max / 2):
+                    # is the region to end core?
+                    if is_core:
+                        region_type[region_id] = ('core', regions[region_id][0][0], regions[region_id][-1][0])
+                    else:
+                        region_type[region_id] = ('accessory', regions[region_id][0][0], regions[region_id][-1][0])
+
+                    region_id += 1
+
+                    if node_position_x < former_position + 1:
+                        regions[region_id] = [(i, '', '') for i in range(former_position - 1, node_position_x, -1)]
+                    else:
+                        regions[region_id] = [(i, '', '') for i in range(former_position - 1, graph_min - 1, -1)] + [(i, '', '') for i in range(graph_max, node_position_x, -1)]  # 3-tuples: later unpacked as (x, y, node)
+
+                    region_type[region_id] = ('reverse', regions[region_id][0][0], regions[region_id][-1][0])
+
+                    # mark a backward bridge region
+                    regions_reverse += [(regions[region_id][0][0], regions[region_id][-1][0])]
+                    region_id += 1
+
+                    # gene cluster core? 
+ if self.is_node_core(node): + # start new core region + regions[region_id] = [(node_position_x, node_position_y, node)] + is_core = True + else: + # start new accessory region + regions[region_id] = [(node_position_x, node_position_y, node)] + is_core = False + + else: + print('A motif of this type is not yet implemented as we never saw it before.') + + visited.add(node) + former_position = node_position_x + + if is_core: + region_type[region_id] = ('core', regions[region_id][0][0], regions[region_id][-1][0]) + else: + region_type[region_id] = ('accessory', regions[region_id][0][0], regions[region_id][-1][0]) + + for region_id_inner, region_tuple in region_type.items(): + # overlap = False + # for region_reverse in regions_reverse: + + # if region_tuple[1] == region_reverse[0] and region_tuple[2] == region_reverse[1]: + # overlap = True + + # elif region_tuple[1] <= region_tuple[2] and region_reverse[0] >= region_reverse[1]: + # # reverse: 15 -> 12 + # # current: 13 -> 16 + + # # ------------ + # # ------------ + # # 12 13 14 15 16 + + # # left_difference: abs(12 - 13) = 1 + # # right_difference: abs(15 - 16) = 1 + # # region_compare_size: (16 - 12) + 1 = 5 + # # region_overlap_size = 5 - (1 + 1) = 3 + + # # region_reverse_size: (15 - 12) + 1 = 4 + # # region_current_size: (16 - 13) + 1 = 4 + + # # test: 3 >= (4 * (1/2)) = True + + # left_difference = abs(min(region_tuple[1], region_tuple[2]) - min(region_reverse[0], region_reverse[1])) + # right_difference = abs(max(region_tuple[1], region_tuple[2]) - max(region_reverse[0], region_reverse[1])) + # region_compare_size = (max(region_tuple[1], region_tuple[2], region_reverse[0], region_reverse[1]) - min(region_tuple[1], region_tuple[2], region_reverse[0], region_reverse[1])) + 1 + # region_overlap_size = region_compare_size - (left_difference + right_difference) + + # region_reverse_size = (max(region_reverse[0], region_reverse[1]) - min(region_reverse[0], region_reverse[1])) + 1 + # region_current_size = 
(max(region_tuple[1], region_tuple[2]) - min(region_tuple[1], region_tuple[2])) + 1 + + # if region_overlap_size >= (region_current_size / 2): + # overlap = True + + # else: + # print('A rearrangement event over the start and end border of the graph is not yet implemented') + + # if overlap: + # region_sides[region_id_inner] = {'start': region_tuple[1], 'end': region_tuple[2], 'types': region_tuple[0], 'consensus': 1.0, 'motif': 'reassortment'} + # else: + region_sides[region_id_inner] = {'start': region_tuple[1], 'end': region_tuple[2], 'types': region_tuple[0], 'genome': genome, 'consensus': -1, 'motif': ''} + + tab = pd.DataFrame.from_dict(region_sides, orient='index') + tab.rename_axis("region_id", inplace=True) + tab.sort_values(['start', 'end'], inplace=True) + + drop_rows = [] + groups = tab.groupby(['genome', 'types']) + # come up with grouping like 0-10, 0-6, 7-10 end up in one group + # easy search for all connections from i = 0 -> 5, 8, 10 take the highest + # i = 0 ... 10 then pick all fragments inside of 0 and 10 e.g. 0 ... 5, 4 ... 
8
+        # then check every position for at most every genome once before combine
+        for name, group in groups:
+
+            i = 0
+            j = i + 1
+            while j < len(group):  # bound j, not i: j keeps advancing while rows merge and must not pass the end
+
+                row_i = group.iloc[i]
+                start_i, end_i, types_i, genome_i, consensus_i, motif_i = row_i
+                index_i = row_i.name
+
+                row_j = group.iloc[j]
+                start_j, end_j, types_j, genome_j, consensus_j, motif_j = row_j
+                index_j = row_j.name
+
+                if (end_i + 1) == start_j:
+                    end_new = max(end_i, end_j)
+
+                    tab.at[index_i, 'end'] = end_new
+                    drop_rows += [index_j]
+
+                    regions[index_i] += regions[index_j]
+                    regions.pop(index_j)
+                    tab.drop(index=index_j, inplace=True)
+
+                    j += 1
+
+                else:
+                    i = j + 1
+                    j = i + 1
+
+        # return(regions, all_nodes)
+        tab.reset_index(drop=False, inplace=True)
+        tab['motif_id'] = tab.loc[:, 'region_id']
+
+        # for key in regions.keys():
+        #     regions[key] = set(regions[key])
+
+        groups = tab.groupby(['start', 'end'])
+        for name, group in groups:
+
+            types_list = list(group['types'])
+
+            if len(set(types_list)) == 1 and len(types_list) <= len(self.genomes_names):
+
+                joining_list = sorted([(i, j, True) if regions[group.at[i,'motif_id']] == regions[group.at[j,'motif_id']] else (i, j, False) for i, j in it.combinations(group.index, 2)])  # compare region i against region j
+
+                for index_i, index_j, join_boolean in joining_list:
+                    if join_boolean:
+                        region_id_i, start_i, end_i, types_i, consensus_i, genome_i, motif_i, motif_id_i = tab.loc[index_i]
+                        region_id_j, start_j, end_j, types_j, consensus_j, genome_j, motif_j, motif_id_j = tab.loc[index_j]
+
+                        tab.at[index_j, 'motif_id'] = motif_id_i
+                        tab.at[index_j, 'motif_id'] = motif_id_i
+
+                        tab.at[index_i, 'consensus'] = 1.0
+                        tab.at[index_j, 'consensus'] = 1.0
+
+                        tab.at[index_i, 'motif'] = 'hypervariability' if types_i == 'accessory' and len(types_list) == len(self.genomes_names) else types_i
+                        tab.at[index_j, 'motif'] = 'hypervariability' if types_i == 'accessory' and len(types_list) == len(self.genomes_names) else types_i
+
+                        # regions[region_id_i] += regions[region_id_j]
+
+            elif 
set(types_list) == set(['forward', 'accessory']) and len(types_list) <= len(self.genomes_names):
+
+                forward_group = group[group["types"] == 'forward']
+                accessory_group = group[group["types"] == 'accessory']
+
+                joining_equal = all([True if regions[accessory_group.at[i,'motif_id']] == regions[accessory_group.at[j,'motif_id']] else False for i, j in it.combinations(accessory_group.index, 2)])  # compare region i against region j
+                joining_list = sorted([(i, j, True) if regions[group.at[i,'motif_id']] == regions[group.at[j,'motif_id']] else (i, j, False) for i, j in it.combinations(group.index, 2)])
+
+                for index_i, index_j, join_boolean in joining_list:
+                    if join_boolean:
+                        region_id_i, start_i, end_i, types_i, consensus_i, genome_i, motif_i, motif_id_i = tab.loc[index_i]
+                        region_id_j, start_j, end_j, types_j, consensus_j, genome_j, motif_j, motif_id_j = tab.loc[index_j]
+
+                        tab.at[index_j, 'motif_id'] = motif_id_i
+                        tab.at[index_j, 'motif_id'] = motif_id_i
+
+                        if joining_equal:
+                            a = len(forward_group)
+                            b = len(accessory_group)
+                            if a >= b:
+                                consensus = round(a / (a+b), 3)
+                                motif = 'insertion'
+                            else:
+                                consensus = round(b / (a+b), 3)
+                                motif = 'deletion'
+                        else:
+                            consensus = 1.0
+                            motif = 'hypervariability'
+
+                        tab.at[index_i, 'consensus'] = consensus
+                        tab.at[index_j, 'consensus'] = consensus
+
+                        tab.at[index_i, 'motif'] = motif
+                        tab.at[index_j, 'motif'] = motif
+
+                        # regions[region_id_i] += regions[region_id_j]
+
+            elif set(types_list) == set(['break', 'accessory']) and len(types_list) <= len(self.genomes_names):
+
+                accessory_group = group[group["types"] == 'accessory']  # must be defined per-branch; previously relied on a stale value from an earlier iteration
+
+                joining_equal = all([True if regions[accessory_group.at[i,'motif_id']] == regions[accessory_group.at[j,'motif_id']] else False for i, j in it.combinations(accessory_group.index, 2)])
+                joining_list = sorted([(i, j, True) if regions[group.at[i,'motif_id']] == regions[group.at[j,'motif_id']] else (i, j, False) for i, j in it.combinations(group.index, 2)])
+
+                for index_i, index_j, join_boolean in joining_list:
+                    if join_boolean:
+                        region_id_i, start_i, end_i, types_i, consensus_i, 
genome_i, motif_i, motif_id_i = tab.loc[index_i]
+                        region_id_j, start_j, end_j, types_j, consensus_j, genome_j, motif_j, motif_id_j = tab.loc[index_j]
+
+                        tab.at[index_j, 'motif_id'] = motif_id_i
+                        tab.at[index_j, 'motif_id'] = motif_id_i
+
+                        if joining_equal:
+                            consensus = 1.0
+                            motif = 'broken core'
+                        else:
+                            consensus = 1.0
+                            motif = 'hypervariability'
+
+                        tab.at[index_i, 'consensus'] = consensus
+                        tab.at[index_j, 'consensus'] = consensus
+
+                        tab.at[index_i, 'motif'] = motif
+                        tab.at[index_j, 'motif'] = motif
+
+                        # regions[region_id_i] += regions[region_id_j]
+
+            else:
+                continue
+
+        tab.sort_values(['start', 'end'], inplace=True)
+        tab.reset_index(drop=True, inplace=True)
+
+        groups = tab.groupby(['motif_id'])
+        nodes_dict_id = 0
+        nodes_dict = {}
+        for motif_id, group in groups:
+
+            for index, row in group.iterrows():
+
+                region_id = row['region_id']
+                genome = row['genome']
+
+                for region in regions[region_id]:
+                    node_position_x, node_position_y, node = region
+
+                    if node:
+                        nodes_dict[nodes_dict_id] = {'motif_id': motif_id, 'node_position_x': node_position_x, 'node_position_y': node_position_y, 'gene_cluster_id': node, 'genome': genome}
+                        nodes_dict_id += 1
+
+        tab2 = pd.DataFrame.from_dict(nodes_dict, orient='index').set_index(['gene_cluster_id', 'genome'])
+        tab1 = tab.drop(['region_id'], axis=1).set_index(['motif_id', 'genome'])
+
+        nodes_dict = {n:d for n,d in self.ancest.nodes(data='genome')}
+        tab3 = pd.DataFrame.from_dict(
+            {(i,j): nodes_dict[i][j]
+            for i in nodes_dict.keys()
+            for j in nodes_dict[i].keys()
+            if i != 'start' and i != 'stop'}, orient='index')
+
+        tab3.drop('gene_cluster_id', inplace=True, axis=1)
+        tab3.rename_axis(['gene_cluster_id', 'genome'], inplace=True)
+
+        for source in self.functional_annotation_sources_available:
+            tab3[source] = tab3[source].apply(lambda x: ('None', 'None', 'None') if x == 'None' else x)
+            tab3[[source + '_ID', source + '_TEXT', source + '_E_VALUE']] = pd.DataFrame(tab3[source].values.tolist(), 
index=tab3.index) + tab3.drop(source, inplace=True, axis=1) + + tab3.fillna('None', inplace=True) + tab4 = pd.merge(tab2, tab3, how='left', on=['gene_cluster_id', 'genome']) + + if len(tab2) != len(tab3): + self.run.info_single('Genecall entries are missing from the result table!') + + tab1.to_csv(self.output_summary + '/motifs.tsv', sep='\t') + tab4.to_csv(self.output_summary + '/genes.tsv', sep='\t') + + self.run.info_single("Done") + + + def update_json_dict(self): + self.run.warning(None, header="Update JSON dictionary for export", lc="green") + + self.jsondata["infos"]['meta']['global_y'] = self.global_y + self.jsondata["infos"]['meta']['global_x_offset'] = self.global_x_offset + self.jsondata["infos"]['max_edge_length_filter'] = self.max_edge_length_filter + self.jsondata["infos"]['gene_cluster_grouping_threshold'] = self.gene_cluster_grouping_threshold + self.jsondata["infos"]['groupcompress'] = self.groupcompress + self.jsondata["infos"]['ungroup'] = self.ungroup + self.jsondata["infos"]['layout_graph']['edges'] = len(self.ancest.edges()) + self.jsondata["infos"]['data'] = list(self.ancest.graph.items()) + self.jsondata["infos"]['directed'] = self.ancest.is_directed() + self.jsondata["infos"]['groups'] = self.grouping + self.jsondata['infos']['grouped'] = {'nodes': len(self.ancest.nodes()), 'edges': len(self.ancest.edges())} + + for i, j in self.ancest.nodes(data=True): + self.jsondata["elements"]["nodes"][str(i)]["position"]['y'] = j['pos'][1] + self.jsondata["elements"]["nodes"][str(i)]["position"]['x_offset'] = self.offset[i] + + for edge in self.jsondata["elements"]["edges"]: + source = self.jsondata["elements"]["edges"][edge]["source"] + target = self.jsondata["elements"]["edges"][edge]["target"] + self.jsondata["elements"]["edges"][edge]["shown"] = 1 + + if (source, target) in self.removed: + self.jsondata["elements"]["edges"][edge]["shown"] = 0 + self.jsondata["elements"]["edges"][edge]["bended"] = "" + + elif self.ancest.has_edge(source, target): + 
self.jsondata["elements"]["edges"][edge]["bended"] = [{'x': x, 'y': y} for x, y in self.ancest[source][target]["bended"]] if len(self.ancest[source][target]["bended"]) != 0 else "" + + self.run.info_single("Done") + + + # TODO rework that section for better debugging and add more features as an example fuse start and top + # together or remove both so the graph becomes a REAL circle. Aside from that there is a bug in the remove + # edges section for (k,o) in circular edges and for (k,o) in pangenome edges. Change and test while reworking. + def get_json_dict_for_graph(self): + self.run.warning(None, header="Calculating syntenous distances", lc="green") + + # NOTE: Any change in `self.jsondata` will require the pangraph JSON in anvio.tables.__init__ + # to incrase by one (so that the new code that works with the new structure requires + # previously generated JSON to be recomputed). + + nodes_all = len(self.ancest.nodes()) + edges_all = len(self.ancest.edges()) + + X = np.zeros([len(self.genomes_names), len(self.genomes_names)]) + for genome_i, genome_j in it.combinations(self.genomes_names, 2): + nodes_similar = 0 + edges_similar = 0 + nodes_unsimilar = 0 + edges_unsimilar = 0 + for _, data in self.ancest.nodes(data=True): + if genome_i in data['genome'].keys() and genome_j in data['genome'].keys(): + nodes_similar += 1 + elif genome_i in data['genome'].keys() or genome_j in data['genome'].keys(): + nodes_unsimilar += 1 + for _, _, data in self.ancest.edges(data=True): + if genome_i in data['genome'].keys() and genome_j in data['genome'].keys(): + edges_similar += 1 + elif genome_i in data['genome'].keys() or genome_j in data['genome'].keys(): + edges_unsimilar += 1 + + i = self.genomes_names.index(genome_i) + j = self.genomes_names.index(genome_j) + + elements_similar = nodes_similar + edges_similar + elements_unsimilar = nodes_unsimilar + edges_unsimilar + elements_all = nodes_all + edges_all + + X[i][j] = elements_unsimilar / (elements_similar + 
elements_unsimilar) + X[j][i] = elements_unsimilar / (elements_similar + elements_unsimilar) + + self.run.info_single(f"{genome_i} and {genome_j}: {round(X[i][j], 3)}") + + condensed_X = squareform(X) + Z = linkage(condensed_X, 'ward') + tree = to_tree(Z, False) + newick = clustering.get_newick(tree, tree.dist, self.genomes_names) + + if self.output_graphics: + fig = plt.figure(figsize=(25, 10)) + ax = plt.axes() + dn = dendrogram(Z, ax=ax, labels=self.genomes_names, orientation='right') + plt.tight_layout() + fig.savefig(self.output_graphics + 'phylogenie' + '.pdf') + + self.run.info_single("Done.") + self.run.warning(None, header="Creating JSON dictionary for export", lc="green") + + self.jsondata["infos"] = { + 'meta': { + 'title': self.project_name, + 'version': anvio.__pangraph__version__, + 'global_x': self.global_x, + 'global_y': self.global_y, + 'global_x_offset': self.global_x_offset + }, + 'genomes': {genome: 'on' for genome in self.genomes}, + 'functional_annotation_sources_available': self.functional_annotation_sources_available, + 'num_genomes': len(self.genomes), + 'max_edge_length_filter': self.max_edge_length_filter, + 'gene_cluster_grouping_threshold': self.gene_cluster_grouping_threshold, + 'groupcompress': self.groupcompress, + 'ungroup': self.ungroup, + 'priority_genome': self.priority_genome, + 'newick': newick, + 'layout_graph': { + 'nodes': len(self.ancest.nodes()), + 'edges': len(self.ancest.edges()) + }, + 'data': list(self.ancest.graph.items()), + 'directed': self.ancest.is_directed(), + 'groups': self.grouping, + 'grouped': { + 'nodes': len(self.ancest.nodes()), + 'edges': len(self.ancest.edges()) + }, + 'layers_data': self.data_table_dict + } + + self.jsondata['elements'] = { + 'nodes': {}, + 'edges': {} + } + + for i, j in self.ancest.nodes(data=True): + + self.jsondata["elements"]["nodes"][str(i)] = { + "name": j["name"], + "weight": j["weight"], + "layer": j["layer"], + "position": { + 'x': j['pos'][0], + 'y': j['pos'][1], + 
'x_offset': self.offset[i] + }, + "genome": {key: {item: j["genome"][key][item] for item in ['gene_call', 'contig', 'direction']} for key in j["genome"].keys()} + } + + for l, (k, o, m) in enumerate(self.ancest.edges(data=True)): + + self.jsondata["elements"]["edges"]['E_' + str(l).zfill(8)] = { + "source": k, + "target": o, + "shown": 1, + "genome": m["genome"], + "weight": m["weight"], + "direction": m["direction"], + "bended": [{'x': x, 'y': y} for x, y in m["bended"]] if len(m["bended"]) != 0 else "" + } + + self.run.info_single("Done.") + + + def store_network(self): + """Function to store final graph structure in a pan-db and/or JSON flat text output file""" + + self.run.warning(None, header="Exporting pangenome graph to JSON", lc="green") + + if self.output_graphml: + + graphml = nx.DiGraph() + graphml.add_nodes_from([n for n, d in self.ancest.nodes(data=True)]) + graphml.add_edges_from([(i, j) for i, j, d in self.ancest.edges(data=True)]) + + nx.write_graphml_lxml(graphml, self.output_graphml) + self.run.info("GraphML output file", os.path.abspath(self.output_graphml)) + + if self.json_output_file_path: + + filesnpaths.is_output_file_writable(self.json_output_file_path) + + with open(self.json_output_file_path, 'w') as output: + output.write(json.dumps(self.jsondata, indent=2)) + self.run.info("JSON output file", os.path.abspath(self.json_output_file_path)) + else: + self.run.info("JSON output file", "Skipped (but OK)", mc='red') + + if not self.skip_storing_in_pan_db: + raise ConfigError("The storage of graph data in pan-db is not yet implemented :/") + else: + self.run.info("Into the pan-db", "Skipped (but OK)", mc='red') + + self.run.info_single("Done") \ No newline at end of file diff --git a/anvio/tables/__init__.py b/anvio/tables/__init__.py index 6ef632fe1..5b1a71601 100644 --- a/anvio/tables/__init__.py +++ b/anvio/tables/__init__.py @@ -24,6 +24,9 @@ workflow_config_version = "3" metabolic_modules_db_version = "4" +# versions of key text data types 
+pangraph_json_version = "2" + + versions_for_db_types = {'contigs': contigs_db_version, 'profile': profile_db_version, 'genes': genes_db_version, diff --git a/anvio/utils.py b/anvio/utils.py index 6aa091f54..684408bb2 100644 --- a/anvio/utils.py +++ b/anvio/utils.py @@ -4439,6 +4439,21 @@ def get_yaml_as_dict(file_path): f"{file_path} as a YAML file. It is likely that it is not a properly " f"formatted YAML file and it needs editing, but here is the error " f"message in case it clarifies things: '{e}'.") + +def save_dict_as_yaml(data, file_path): + """Store a dictionary as a YAML file""" + + filesnpaths.is_output_file_writable(file_path) + + try: + with open(file_path, 'w') as outfile: + yaml.dump(data, outfile, default_flow_style=False) + + except Exception as e: + raise ConfigError(f"Anvi'o ran into some trouble when trying to store the dictionary at " + f"{file_path} as a YAML file. It is likely that the data is not " + f"serializable as YAML or the file is not writable, but here is the error " + f"message in case it clarifies things: '{e}'.") + + def download_file(url, output_file_path, check_certificate=True, progress=progress, run=run): diff --git a/bin/anvi-display-pan-graph b/bin/anvi-display-pan-graph new file mode 100755 index 000000000..e3cbb7da8 --- /dev/null +++ b/bin/anvi-display-pan-graph @@ -0,0 +1,69 @@ +#!/usr/bin/env python +# -*- coding: utf-8 + +import sys +from anvio.argparse import ArgumentParser + +import anvio +import anvio.utils as utils +import anvio.terminal as terminal +import anvio.interactive as interactive +from anvio.bottleroutes import BottleApplication + +from anvio.errors import ConfigError, FilesNPathsError, DictIOError + +__author__ = "Developers of anvi'o (see AUTHORS.txt)" +__copyright__ = "Copyleft 2015-2018, the Meren Lab (http://merenlab.org/)" +__credits__ = [] +__license__ = "GPL 3.0" +__version__ = anvio.__version__ +__authors__ = ['ahenoch'] +__requires__ = ['pan-graph-json'] +__provides__ = ['interactive', 'svg'] +__description__ = "Start an anvi'o 
interactive interface to view pan graphs." + +run = terminal.Run() +progress = terminal.Progress() + +if __name__ == '__main__': + parser = ArgumentParser(description=__description__) + groupA = parser.add_argument_group('PRIMARY INPUT', "Specify where your pan-db is.") + groupA.add_argument(*anvio.A('pan-db'), **anvio.K('pan-db', {'required': False})) + groupA.add_argument(*anvio.A('genomes-storage'), **anvio.K('genomes-storage', {'required': False})) + + groupB = parser.add_argument_group('OPTIONAL INPUT', "By default, this program will read the pan graph data from the pan-db tables. However, " + "you can also provide it with a JSON file that you have generated previously, in which case the program will skip " + "attempting to read anything from the pan-db and will use the input file to initialize an interface.") + groupB.add_argument(*anvio.A('pan-graph-json'), **anvio.K('pan-graph-json')) + + groupC = parser.add_argument_group('SERVER CONFIGURATION', "For power users.") + groupC.add_argument(*anvio.A('dry-run'), **anvio.K('dry-run')) + groupC.add_argument(*anvio.A('ip-address'), **anvio.K('ip-address')) + groupC.add_argument(*anvio.A('port-number'), **anvio.K('port-number')) + groupC.add_argument(*anvio.A('browser-path'), **anvio.K('browser-path')) + groupC.add_argument(*anvio.A('server-only'), **anvio.K('server-only')) + groupC.add_argument(*anvio.A('password-protected'), **anvio.K('password-protected')) + + args = parser.get_args(parser) + + try: + d = interactive.PangraphInteractive(args, run=run, progress=progress) + + if args.dry_run: + run.info_single('Dry run, eh? Fine. 
Bai!', nl_after=1) + sys.exit() + + args.mode = 'pangraph' + port_number = utils.get_port_num(args.port_number, args.ip_address, run=run) + + app = BottleApplication(d) + app.run_application(args.ip_address, port_number) + except ConfigError as e: + print(e) + sys.exit(-1) + except FilesNPathsError as e: + print(e) + sys.exit(-2) + except DictIOError as e: + print(e) + sys.exit(-3) diff --git a/bin/anvi-pan-graph b/bin/anvi-pan-graph new file mode 100755 index 000000000..c59ab92b4 --- /dev/null +++ b/bin/anvi-pan-graph @@ -0,0 +1,89 @@ +#!/usr/bin/env python +# -*- coding: utf-8 +"""Compute a graph representation of a pangenome""" + +import sys + +import anvio +import anvio.panops as panops +import anvio.terminal as terminal + +from anvio.errors import ConfigError, FilesNPathsError + + +__author__ = "Developers of anvi'o (see AUTHORS.txt)" +__copyright__ = "Copyleft 2015-2018, the Meren Lab (http://merenlab.org/)" +__credits__ = [] +__license__ = "GPL 3.0" +__version__ = anvio.__version__ +__authors__ = ['ahenoch'] +__requires__ = ['pan-db', 'genomes-storage-db', 'external-genomes'] +__provides__ = ['pan-graph-json'] +__description__ = ("An anvi'o program to compute a graph representation of pangenomes. 
It will do its magic, and store it into your " + "pan-db, or report a JSON formatted graph file, for downstream visualization and analyses with `anvi-display-pan-graph`") +__resources__ = [] + + +run = terminal.Run() +progress = terminal.Progress() + + +if __name__ == '__main__': + from anvio.argparse import ArgumentParser + + parser = ArgumentParser(description=__description__) + + groupA = parser.add_argument_group('INPUT', "Anvi'o artifacts for the pan graph to be computed.") + groupA.add_argument(*anvio.A('pan-db'), **anvio.K('pan-db', {'required': False})) + groupA.add_argument(*anvio.A('genomes-storage'), **anvio.K('genomes-storage', {'required': False})) + groupA.add_argument(*anvio.A('external-genomes'), **anvio.K('external-genomes', {'required': False})) + groupA.add_argument(*anvio.A('testing-yaml'), **anvio.K('testing-yaml')) + groupA.add_argument(*anvio.A('gene-additional-data'), **anvio.K('gene-additional-data')) + groupA.add_argument(*anvio.A('gc-additional-data'), **anvio.K('gc-additional-data')) + groupA.add_argument(*anvio.A('pan-graph-json'), **anvio.K('pan-graph-json')) + groupA.add_argument(*anvio.A('genomes-names'), **anvio.K('genomes-names')) + groupA.add_argument(*anvio.A('genomes-reverse'), **anvio.K('genomes-reverse')) + + groupB = parser.add_argument_group('OUTPUT', "By default, this program will store the resulting pangraph into the anvi'o pan-db " + "you have provided as input above, so it is accessible to downstream analyses seamlessly. 
Using the " + "the parameters below, you can ask anvi'o to store the resulting graph into a text output file (which " + "may be useful for developers for debugging purposes) or you can ask anvi'o to skip adding the results " + "to the pan-db.") + groupB.add_argument(*anvio.A('output-graphml'), **anvio.K('output-graphml')) + groupB.add_argument(*anvio.A('output-pan-graph-json'), **anvio.K('output-pan-graph-json')) + groupB.add_argument(*anvio.A('output-testing-yaml'), **anvio.K('output-testing-yaml')) + groupB.add_argument(*anvio.A('output-dir-summary'), **anvio.K('output-dir-summary')) + groupB.add_argument(*anvio.A('output-dir-graphics'), **anvio.K('output-dir-graphics')) + groupB.add_argument(*anvio.A('output-raw-gene-additional-data'), **anvio.K('output-raw-gene-additional-data')) + groupB.add_argument(*anvio.A('output-raw-gc-additional-data'), **anvio.K('output-raw-gc-additional-data')) + + groupB.add_argument('--skip-storing-in-pan-db', default=False, action="store_true", help="Do not store the resulting graph into " + "the pan-db.") + + groupC = parser.add_argument_group('DETAILS OF GRAPH COMPUTATION', "Variables that will influence the computation of the graph, the organization " + "of the gene clusters, and edges between them.") + groupC.add_argument('--max-edge-length-filter', default = -1, type=int, help = "In the final pan graph, edges that connect gene clusters will vary in " + "their length. The longer edges that connect far gene clusters with one another will add additional layers to the final " + "display, reducing the readability of the overall graph structure. This parameter, which is by default set to %(default)d, " + "will remove edges that span across more than %(default)d gene clusters. You can change the thresold to make graph much " + "more accurate (lower values) or much more readable (higher values). We suggest you to start with the default, but " + "explore other options if you are not satisfied. 
Please keep in mind that pangenomes that contain a very large number of " + "genomic rearrangement events may take a very long time to compute with very small values of this parameter.") + groupC.add_argument('--gene-cluster-grouping-threshold', default = 2, type=int, help = "This parameter influences how gene clusters that share " + "perfect synteny (across all genomes that contribute genes) are represented in the final display. With the default value " + "of %(default)d, the final graph will represent as many gene clusters as possible in groups.") + groupC.add_argument('--gene-cluster-grouping-compression', default = 0.0, type=float, help = "Description.") + groupC.add_argument('--ungrouping-area', default = [-1, -1], type=list, help = "Description.") + groupC.add_argument('--priority-genome', default = '', type=str, help = "Description.") + + args = parser.get_args(parser) + + try: + graph = panops.Pangraph(args, run, progress) + graph.process() + except ConfigError as e: + print(e) + sys.exit(-1) + except FilesNPathsError as e: + print(e) + sys.exit(-2) \ No newline at end of file diff --git a/bin/anvi-reorient-contigs b/bin/anvi-reorient-contigs new file mode 100755 index 000000000..6e0043180 --- /dev/null +++ b/bin/anvi-reorient-contigs @@ -0,0 +1,177 @@ +#!/usr/bin/env python +# coding: utf-8 + +import os +import sys +import glob +import subprocess + +from Bio import SeqIO, SearchIO +from Bio.SeqRecord import SeqRecord + +import anvio +import anvio.fastalib as f +import anvio.terminal as terminal +import anvio.filesnpaths as filesnpaths + +from anvio.argparse import ArgumentParser +from anvio.errors import ConfigError, FilesNPathsError + +__copyright__ = "Copyleft 2015-2024, The Anvi'o Project (http://anvio.org/)" +__credits__ = [] +__license__ = "GPL 3.0" +__version__ = anvio.__version__ +__authors__ = ['ahenoch', 'meren'] +__requires__ = [] +__provides__ = [] +__description__ = "Reorient contigs based on the longest complete genome present in the given 
folder" + + +progress = terminal.Progress() +run = terminal.Run() +P = terminal.pluralize + + +def get_num_contigs_and_genome_length(file_path): + fasta = f.SequenceSource(file_path) + + num_contigs = 0 + length = 0 + + while next(fasta): + num_contigs += 1 + length += len(fasta.seq) + + return(num_contigs, length) + + +def main(args): + input_dir = args.input_dir + prioritize_genome_size = args.prioritize_genome_size + prioritize_number_of_contigs = args.prioritize_number_of_contigs + + reference_fasta = None + + filesnpaths.is_output_dir_writable(input_dir) + + if (not prioritize_genome_size and not prioritize_number_of_contigs) or (prioritize_genome_size and prioritize_number_of_contigs): + raise ConfigError("You must choose one of these parameters -- not both, not none: `--prioritize-genome-size` " + "or `--prioritize-number-of-contigs`.") + + # find FASTA files in the user directory and determine the best reference de novo + genome_properties = {} + + best_length = 0 + best_num_contigs = sys.maxsize + + F = lambda x: glob.glob(os.path.join(input_dir, x)) + input_fasta_files = F('*.fa') + F('*.fasta') + F('*.fna') + F('*.ffn') + F('*.faa') + + run.warning(f"Anvi'o found {P('FASTA file', len(input_fasta_files))} in out input directory, and " + f"will try to orient contigs in them based on the best reference in the same directory.", + header="FASTA FILES FOUND", lc="yellow") + + for fasta_file in input_fasta_files: + num_contigs, length = get_num_contigs_and_genome_length(fasta_file) + genome_properties[fasta_file] = {'num_contigs': num_contigs, 'length': length} + + run.info_single(f"{os.path.basename(fasta_file)} ({P('contig', num_contigs)}; {P('nt', length)})", level=2) + + if prioritize_genome_size and length > best_length: + reference_fasta = fasta_file + best_length = length + elif prioritize_number_of_contigs and num_contigs < best_num_contigs: + reference_fasta = fasta_file + best_num_contigs = num_contigs + elif prioritize_number_of_contigs and 
num_contigs == best_num_contigs: + if length > best_length: + reference_fasta = fasta_file + best_length = length + else: + # wtf case + pass + + run.info("Method to select reference", "Minimum number of contigs" if prioritize_number_of_contigs else "Maximum length", nl_before=1) + addtl_info = P('nt', genome_properties[reference_fasta]['length']) if prioritize_genome_size else P('contig', genome_properties[reference_fasta]['num_contigs']) + run.info("Reference FASTA", f"{reference_fasta} ({addtl_info})", nl_after=1) + + if genome_properties[reference_fasta]['num_contigs'] != 1: + if prioritize_genome_size: + raise ConfigError("Sadly, the reference genome anvi'o chose for you based on genome size priority seems to have multiple contigs, " + "which suggests that it is not a complete genome :/ The current implementation of this script does not know " + "how to use a genome with multiple contigs as reference. You can try to re-run the program with `--prioritize-number-of-contigs` " + "flag if you have a complete genome.") + elif prioritize_number_of_contigs: + raise ConfigError("The genome in your collection with the smallest number of contigs is still not a complete genome :( There is nothing " + "this program can do for you at this point. 
Sorry!") + else: + # wtf case + pass + + + progress.new("BLAST") + for fasta_file in input_fasta_files: + blast_result_file = '.'.join(fasta_file.split('.')[:-1]) + '-blast.xml' + + if fasta_file != reference_fasta: + # run BLAST + progress.update(f"Running {os.path.basename(fasta_file)} against the reference ...") + cmd_line = ["blastn", "-query", reference_fasta, "-out", blast_result_file, "-outfmt", "5", "-subject", fasta_file] + subprocess.run(cmd_line, text=True, capture_output=True) + + progress.update(f"Parsing {os.path.basename(fasta_file)} hits ...") + qresult = SearchIO.read(blast_result_file, 'blast-xml') + fasta_entry = list(SeqIO.parse(fasta_file, "fasta")) + + with open(fasta_file, "w+") as output_handle: + for contig in qresult: + original_record = [entry for entry in fasta_entry if entry.id == contig.id][0] + + original_record_sequence = original_record.seq + reversed_record_sequence = original_record.seq.reverse_complement() + + reoriented_record_hit = contig[0].hit.seq.replace("-", "") + reoriented_record_sequence = "" + + if reoriented_record_hit in original_record_sequence: + reoriented_record_sequence = original_record_sequence + elif reoriented_record_hit in reversed_record_sequence: + reoriented_record_sequence = reversed_record_sequence + else: + pass + + if reoriented_record_sequence: + reoriented_record = SeqRecord(reoriented_record_sequence, id=original_record.id, name=original_record.name, description=original_record.description) + SeqIO.write(reoriented_record, output_handle, "fasta") + else: + progress.reset() + run.info_single(f"No output possible, the FASTA files do not align :(") + + progress.end() + + +if __name__ == '__main__': + parser = ArgumentParser(description=__description__) + groupA = parser.add_argument_group('INPUT FILES') + groupA.add_argument(*anvio.A('input-dir'), **anvio.K('input-dir', {'help': "Path for the directory " + "that contains the FASTA files of interest. 
Anvi'o will take into consideration any file " + "in this directory that has any of the following extensions: '.fa', '.fasta', '.fna', " + "'.ffn', and '.faa'."})) + + groupB = parser.add_argument_group('REFERECE GENOME OPTIONS', description="Orienting contigs will require a reference genome that " + "must be chosen by none other than YOU.") + groupB.add_argument('--prioritize-genome-size', action="store_true", default=False, help="Of all the candidates, choose the genome " + "with largest size as 'reference' to orient contigs in other FASTA files.") + groupB.add_argument('--prioritize-number-of-contigs', action="store_true", default=False, help="Of all the candidates, choose the genome " + "with the smallest number of contigs as 'reference' to orient contigs in other FASTA files.") + + args = parser.get_args(parser) + + try: + main(args) + except ConfigError as e: + print(e) + sys.exit(-1) + except FilesNPathsError as e: + print(e) + sys.exit(-2) diff --git a/requirements.txt b/requirements.txt index c0abfc1d8..5331fdf70 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,7 +22,7 @@ pandas==1.4.4 snakemake multiprocess plotext -networkx +networkx==3.1 pulp==2.7.0 biopython reportlab diff --git a/requirements.yml b/requirements.yml new file mode 100644 index 000000000..612344b67 --- /dev/null +++ b/requirements.yml @@ -0,0 +1,65 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconductor-qvalue + - biopython + - blast + - bottle + - bowtie2 + - bwa + - colored + - diamond + - django + - ete3 + - famsa + - fastani + - fasttree + - ghostscript + - graphviz + - hmmer + - idba + - illumina-utils + - iqtree + - matplotlib==3.5.1 + - mcl + - megahit + - meme + - mistune + - multiprocess + - muscle=3.8.1551 + - networkx==3.1 + - nodejs + - numba + - numpy[version='<=1.24'] + - pandas==1.4.4 + - paste + - plotext + - prodigal + - psutil + - pulp==2.7.0 + - pyani + - pymupdf + - pysam + - python=3.10 + - r-base + - r-magrittr + - 
r-optparse + - r-stringi + - r-tidyverse + - reportlab + - requests + - rich-argparse + - samtools[version='>=1.9'] + - scikit-learn==1.2.2 + - scipy + - six + - snakemake + - spades + - sqlite + - statsmodels + - tabulate + - trimal + - trnascan-se + - vmatch