Skip to content

Commit

Permalink
Merge pull request EBIvariation#439 from tcezard/EVA3541_Fix_duplicat…
Browse files Browse the repository at this point in the history
…e_counts

EVA-3541 - Remove duplicate counts when aggregating per taxonomy or assembly
  • Loading branch information
tcezard authored Apr 19, 2024
2 parents 7ac2345 + 9b65aae commit af7f442
Show file tree
Hide file tree
Showing 21 changed files with 104 additions and 61 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,15 @@ do
OUTPUT=tmp_${SC_NAME}_${ASSEMBLY}_${TYPE}.txt
if [[ ${INPUT} == *.vcf.gz ]]
then
zcat "${INPUT}" | grep -v '^#' | awk -v annotation="${ASSEMBLY}-${SC_NAME}-${TYPE}" '{print $3" "annotation}' > ${OUTPUT}
# There are sometime multiple rs (separated by ;) in one line that needs to be split across multiple lines
gzip -d -c "${INPUT}" | grep -v '^#' | awk '{gsub(";","\n",$3); print $3}' | awk -v annotation="${ASSEMBLY}-${SC_NAME}-${TYPE}" '{print $0" "annotation}' > ${OUTPUT}
elif [[ ${INPUT} == *_unmapped_ids.txt.gz ]]
then
SC_NAME=$(basename $(dirname ${INPUT}));
OUTPUT=tmp_${SC_NAME}_unmapped.txt
zcat "${INPUT}" | grep -v '^#' | awk -v annotation="Unmapped-${SC_NAME}-unmapped" '{print $1" "annotation}' > ${OUTPUT}
gzip -d -c "${INPUT}" | grep -v '^#' | awk -v annotation="Unmapped-${SC_NAME}-unmapped" '{print $1" "annotation}' > ${OUTPUT}
else
zcat "${INPUT}" | grep -v '^#' | awk -v annotation="${ASSEMBLY}-${SC_NAME}-${TYPE}" '{print $1" "annotation}' > ${OUTPUT}
gzip -d -c "${INPUT}" | grep -v '^#' | awk -v annotation="${ASSEMBLY}-${SC_NAME}-${TYPE}" '{print $1" "annotation}' > ${OUTPUT}
fi
ALL_TMP_OUTPUT=$OUTPUT" "$ALL_TMP_OUTPUT
done
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
from datetime import datetime

from ebi_eva_common_pyutils.logger import logging_config
from ebi_eva_common_pyutils.mongodb import MongoDatabase
from ebi_eva_common_pyutils.config_utils import get_accession_pg_creds_for_profile
from ebi_eva_common_pyutils.pg_utils import execute_query, get_all_results_for_query
from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_internal_pyutils.mongodb import MongoDatabase
from ebi_eva_internal_pyutils.config_utils import get_accession_pg_creds_for_profile
from ebi_eva_internal_pyutils.pg_utils import execute_query, get_all_results_for_query
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle
from gather_clustering_counts.gather_per_species_clustering_counts import assembly_table_name, tracker_table_name
from urllib.parse import urlsplit

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
import os

from ebi_eva_common_pyutils.logger import logging_config
from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_common_pyutils.command_utils import run_command_with_output
from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query, execute_query
from ebi_eva_internal_pyutils.pg_utils import get_all_results_for_query, execute_query

logger = logging_config.get_logger(__name__)
logging_config.add_stdout_handler()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@

from ebi_eva_common_pyutils.command_utils import run_command_with_output
from ebi_eva_common_pyutils.common_utils import pretty_print
from ebi_eva_common_pyutils.config_utils import get_metadata_creds_for_profile
from ebi_eva_common_pyutils.logger import logging_config, AppLogger
from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query
from ebi_eva_internal_pyutils.config_utils import get_metadata_creds_for_profile
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_internal_pyutils.pg_utils import get_all_results_for_query

from sqlalchemy import select
from sqlalchemy.orm import Session
Expand Down Expand Up @@ -279,7 +279,7 @@ def _write_per_taxonomy_counts(self, session):
self.info(f"Create persistence for aggregate per taxonomy {taxonomy_id}")
taxonomy_row = RSCountPerTaxonomy(
taxonomy_id=taxonomy_id,
assembly_accessions=species_annotation.get(taxonomy_id).get('assemblies'),
assembly_accessions=list(species_annotation.get(taxonomy_id).get('assemblies')),
release_folder=species_annotation.get(taxonomy_id).get('release_folder'),
release_version=self.release_version,
)
Expand Down Expand Up @@ -375,6 +375,11 @@ def get_assembly_counts_from_database(self):
return results

def parse_count_script_logs(self, all_logs):
'''
Create a list of grouped count
:param all_logs:
:return:
'''
for log_file in all_logs:
with open(log_file) as open_file:
for line in open_file:
Expand All @@ -394,31 +399,40 @@ def generate_per_taxonomy_counts(self):
species_counts = defaultdict(Counter)
species_annotations = defaultdict(dict)
for count_groups in self.all_counts_grouped:
for count_dict in count_groups:
species_counts[count_dict['taxonomy']][count_dict['idtype']] += count_dict['count']
if 'assemblies' not in species_annotations.get(count_dict['taxonomy'], {}):
species_annotations[count_dict['taxonomy']] = {
'assemblies': set(),
'release_folder': None
}

species_annotations[count_dict['taxonomy']]['assemblies'].add(count_dict['assembly'])
species_annotations[count_dict['taxonomy']]['release_folder'] = count_dict['release_folder']
taxonomy_and_types = set([(count_dict['taxonomy'], count_dict['idtype']) for count_dict in count_groups])
for taxonomy, rstype in taxonomy_and_types:
if taxonomy not in species_annotations:
species_annotations[taxonomy] = {'assemblies': set(), 'release_folder': None}
# All count_dict have the same count in a group
species_counts[taxonomy][rstype] += count_groups[0]['count']
species_annotations[taxonomy]['assemblies'].update(
set([
count_dict['assembly']
for count_dict in count_groups
if count_dict['taxonomy'] is taxonomy and count_dict['idtype'] is rstype
])
)
species_annotations[taxonomy]['release_folder'] = count_groups[0]['release_folder']
return species_counts, species_annotations

def generate_per_assembly_counts(self):
assembly_counts = defaultdict(Counter)
assembly_annotations = {}
for count_groups in self.all_counts_grouped:
for count_dict in count_groups:
assembly_counts[count_dict['assembly']][count_dict['idtype']] += count_dict['count']
if 'taxonomies' not in assembly_annotations.get(count_dict['assembly'], {}):
assembly_annotations[count_dict['assembly']] = {
'taxonomies': set(),
'release_folder': None
}
assembly_annotations[count_dict['assembly']]['taxonomies'].add(count_dict['taxonomy'])
assembly_annotations[count_dict['assembly']]['release_folder'] = count_dict['assembly']
assembly_and_types = set([(count_dict['assembly'], count_dict['idtype']) for count_dict in count_groups])
for assembly_accession, rstype in assembly_and_types:
if assembly_accession not in assembly_annotations:
assembly_annotations[assembly_accession] = {'taxonomies': set(), 'release_folder': None}
# All count_dict have the same count in a group
assembly_counts[assembly_accession][rstype] += count_groups[0]['count']
assembly_annotations[assembly_accession]['taxonomies'].update(
set([
count_dict['taxonomy']
for count_dict in count_groups
if count_dict['assembly'] is assembly_accession and count_dict['idtype'] is rstype
]))

assembly_annotations[assembly_accession]['release_folder'] = assembly_accession
return assembly_counts, assembly_annotations

# def generate_per_species_assembly_counts(self):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import argparse
import csv

from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_common_pyutils.logger import logging_config
from ebi_eva_common_pyutils.pg_utils import execute_query
from ebi_eva_internal_pyutils.pg_utils import execute_query

from gather_clustering_counts.gather_per_species_clustering_counts import assembly_table_name

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@

from ebi_eva_common_pyutils.common_utils import pretty_print
from ebi_eva_common_pyutils.logger import logging_config
from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query, execute_query
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_internal_pyutils.pg_utils import get_all_results_for_query, execute_query

from gather_clustering_counts.gather_per_species_clustering_counts import get_taxonomy_and_scientific_name, \
assembly_table_name, id_to_column
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
18746871 GCA_000188235.2-oreochromis_niloticus-current,GCA_000188235.2-haplochromini-current,
4882192 GCA_001858045.3-oreochromis_niloticus-current,
237 GCA_000188235.2-oreochromis_niloticus-current,GCA_001858045.3-oreochromis_niloticus-current,GCA_000188235.2-haplochromini-current,
71 GCA_000188235.2-oreochromis_niloticus-deprecated,GCA_000188235.2-haplochromini-deprecated,
17 Unmapped-oreochromis_niloticus-unmapped,
14 GCA_000188235.2-haplochromini-multimap,GCA_000188235.2-oreochromis_niloticus-multimap,
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from itertools import cycle
from unittest import TestCase
from unittest.mock import patch

Expand Down Expand Up @@ -52,8 +53,11 @@ def test_write_counts_to_db(self):
log_files_release1 = [os.path.join(self.resource_folder, 'count_for_release1.log')]
log_files_release2 = [os.path.join(self.resource_folder, 'count_for_release2.log')]
list_cow_assemblies = ['GCA_000003055.3', 'GCA_000003055.5', 'GCA_000003205.1', 'GCA_000003205.4', 'GCA_000003205.6', 'Unmapped']
with patch.object(ReleaseCounter, 'get_taxonomy_and_scientific_name') as ptaxonomy:
ptaxonomy.return_value = (9913, 'Bos taurus')
folder_to_taxonomy = {'bos_taurus': 9913}

with patch.object(ReleaseCounter, 'get_taxonomy') as ptaxonomy:
# ptaxonomy.side_effect = lambda x: folder_to_taxonomy.get(x)
ptaxonomy.return_value = 9913
counter = ReleaseCounter(self.private_config_xml_file, config_profile=self.config_profile,
release_version=1, logs=log_files_release1)
counter.write_counts_to_db()
Expand All @@ -67,7 +71,7 @@ def test_write_counts_to_db(self):
result = session.execute(query).fetchone()
rs_taxonomy_count = result.RSCountPerTaxonomy
assert sorted(rs_taxonomy_count.assembly_accessions) == list_cow_assemblies
assert rs_taxonomy_count.current_rs == 169904286
assert rs_taxonomy_count.current_rs == 102813585
assert rs_taxonomy_count.new_current_rs == 0
assert rs_taxonomy_count.release_folder == 'Cow_9913'

Expand All @@ -76,8 +80,8 @@ def test_write_counts_to_db(self):
result = session.execute(query).fetchone()
rs_taxonomy_count = result.RSCountPerTaxonomy
assert sorted(rs_taxonomy_count.assembly_accessions) == list_cow_assemblies
assert rs_taxonomy_count.current_rs == 169101573
assert rs_taxonomy_count.new_current_rs == -802713
assert rs_taxonomy_count.current_rs == 102605893
assert rs_taxonomy_count.new_current_rs == -207692
assert rs_taxonomy_count.release_folder == 'bos_taurus'

query = select(RSCountPerAssembly).where(RSCountPerAssembly.assembly_accession == 'GCA_000003205.6',
Expand All @@ -89,3 +93,21 @@ def test_write_counts_to_db(self):
assert rs_assembly_count.new_current_rs == 0
assert rs_assembly_count.release_folder == 'GCA_000003205.6'

def test_write_counts_to_db2(self):
log_files_release = [os.path.join(self.resource_folder, 'count_for_haplochromini_oreochromis_niloticus.log')]
folder_to_taxonomy = {'oreochromis_niloticus': 8128, 'haplochromini': 319058}

with patch.object(ReleaseCounter, 'get_taxonomy') as ptaxonomy:
ptaxonomy.side_effect = lambda x: folder_to_taxonomy.get(x)
counter = ReleaseCounter(self.private_config_xml_file, config_profile=self.config_profile,
release_version=4, logs=log_files_release)
counter.write_counts_to_db()
session = Session(counter.sqlalchemy_engine)

query = select(RSCountPerAssembly).where(RSCountPerAssembly.assembly_accession == 'GCA_000188235.2',
RSCountPerAssembly.release_version == 4)
result = session.execute(query).fetchone()
rs_assembly_count = result.RSCountPerAssembly
assert rs_assembly_count.current_rs == 18747108 # 18746871 + 237
assert rs_assembly_count.release_folder == 'GCA_000188235.2'

Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@
from publish_release_to_ftp.create_assembly_name_symlinks import create_assembly_name_symlinks
from ebi_eva_common_pyutils.command_utils import run_command_with_output
from ebi_eva_common_pyutils.logger import logging_config
from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_internal_pyutils.pg_utils import get_all_results_for_query
from run_release_in_embassy.run_release_for_species import get_common_release_properties
from run_release_in_embassy.release_metadata import release_vcf_file_categories, release_text_file_categories

Expand Down
2 changes: 1 addition & 1 deletion eva-accession-release-automation/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ebi_eva_common_pyutils[internal]==0.6.5
ebi_eva_common_pyutils[eva-internal]==0.6.5
click==7.1.2
pytz==2022.6
pyyaml==5.3.1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@
import traceback

from ebi_eva_common_pyutils.command_utils import run_command_with_output
from ebi_eva_common_pyutils.config_utils import get_mongo_uri_for_eva_profile
from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_common_pyutils.mongo_utils import copy_db
from ebi_eva_internal_pyutils.config_utils import get_mongo_uri_for_eva_profile
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_internal_pyutils.mongo_utils import copy_db
from pymongo import MongoClient
from pymongo.uri_parser import parse_uri
from run_release_in_embassy.release_common_utils import open_mongo_port_to_tempmongo, close_mongo_port_to_tempmongo, \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
import os

import click
from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_common_pyutils.spring_properties import SpringPropertiesGenerator
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_internal_pyutils.spring_properties import SpringPropertiesGenerator

from run_release_in_embassy.release_common_utils import get_release_db_name_in_tempmongo_instance
from run_release_in_embassy.release_metadata import get_release_inventory_info_for_assembly
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@
from itertools import cycle

from ebi_eva_common_pyutils.assembly import NCBIAssembly
from ebi_eva_common_pyutils.config_utils import get_mongo_uri_for_eva_profile
from ebi_eva_internal_pyutils.config_utils import get_mongo_uri_for_eva_profile
from ebi_eva_common_pyutils.logger import logging_config, AppLogger
from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_common_pyutils.mongodb import MongoDatabase
from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query, execute_query
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_internal_pyutils.mongodb import MongoDatabase
from ebi_eva_internal_pyutils.pg_utils import get_all_results_for_query, execute_query
from ebi_eva_common_pyutils.taxonomy.taxonomy import normalise_taxon_scientific_name, get_scientific_name_from_ensembl


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from include_mapping_weight_from_dbsnp.snpmapinfo_metadata import \
get_snpmapinfo_tables_with_overweight_snps_for_dbsnp_species, get_snpmapinfo_asm_columns
from include_mapping_weight_from_dbsnp.dbsnp_mirror_metadata import get_db_conn_for_species, get_species_info
from ebi_eva_common_pyutils.pg_utils import get_pg_connection_handle, execute_query, create_index_on_table, \
from ebi_eva_internal_pyutils.pg_utils import get_pg_connection_handle, execute_query, create_index_on_table, \
vacuum_analyze_table


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@


from run_release_in_embassy.release_metadata import update_release_progress_status
from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle


logger = logging.getLogger(__name__)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import os

from ebi_eva_common_pyutils.command_utils import run_command_with_output
from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle
from run_release_in_embassy.release_metadata import release_vcf_file_categories, release_text_file_categories, \
get_release_inventory_info_for_assembly
from run_release_in_embassy.release_common_utils import get_bgzip_bcftools_index_commands_for_file, \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import traceback

from run_release_in_embassy.release_metadata import get_target_mongo_instance_for_assembly
from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_common_pyutils.network_utils import get_available_local_port, forward_remote_port_to_local_port
from ebi_eva_common_pyutils.taxonomy import taxonomy

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# limitations under the License.
import datetime

from ebi_eva_common_pyutils.pg_utils import get_all_results_for_query
from ebi_eva_internal_pyutils.pg_utils import get_all_results_for_query

release_vcf_file_categories = ["current_ids", "merged_ids"]
release_text_file_categories = ["deprecated_ids", "merged_deprecated_ids"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import yaml

from ebi_eva_common_pyutils.common_utils import merge_two_dicts
from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle
from run_release_in_embassy.release_metadata import get_release_assemblies_for_taxonomy
from run_release_in_embassy.release_common_utils import get_release_folder_name

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@


from run_release_in_embassy.release_metadata import update_release_progress_status
from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle


logger = logging.getLogger(__name__)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from run_release_in_embassy.release_metadata import get_release_inventory_info_for_assembly, \
release_vcf_file_categories, vcf_validation_output_file_pattern, asm_report_output_file_pattern
from ebi_eva_common_pyutils.command_utils import run_command_with_output
from ebi_eva_common_pyutils.metadata_utils import get_metadata_connection_handle
from ebi_eva_internal_pyutils.metadata_utils import get_metadata_connection_handle


def remove_index_if_outdated(fasta_path):
Expand Down

0 comments on commit af7f442

Please sign in to comment.