Skip to content

Commit

Permalink
#480 - annotation and transcript version pages
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Sep 14, 2021
1 parent a377329 commit 245374d
Show file tree
Hide file tree
Showing 9 changed files with 289 additions and 111 deletions.
20 changes: 19 additions & 1 deletion annotation/templates/annotation/annotation.html
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,24 @@ <h5>RefSeq GRCh38</h5>
{% endinstall-instructions %}
{% endlabelled %}

{% labelled label="TranscriptVersion Sequence Info" %}
{% if transcript_version_sequence_info %}
{{ transcript_version_sequence_info }}
{% for fasta_import in transcript_fasta_imports %}
<div title="{{ fasta_import.md5_hash }}">
{{ fasta_import }} ({{ fasta_import.created|localtime }})
</div>
{% endfor %}
{% else %}
<div class="no-value">None</div>
{% endif %}
{% install-instructions "TranscriptVersion Sequence Info" installed=transcript_fasta_imports %}
This is genome build independent, so just download most recently updated version.
<code>wget https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz
{{ python_command }} manage.py import_refseq_transcript_fasta ${VARIANTGRID_SETUP_DATA}/GRCh38_latest_rna.fna.gz</code>
{% endinstall-instructions %}
{% endlabelled %}

{% for ontology_count in ontology_counts %}
{% labelled label=ontology_count.service %}
{{ ontology_count.count | intcomma }}
Expand Down Expand Up @@ -410,6 +428,6 @@ <h5>RefSeq GRCh38</h5>
</div>
{% endlabelled %}
{% endfor %}
</>
</div>
</div>
{% endblock content %}
57 changes: 39 additions & 18 deletions annotation/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@
from annotation.models.models_version_diff import VersionDiff
from annotation.tasks.annotate_variants import annotation_run_retry
from annotation.vep_annotation import get_vep_command
from genes.models import GeneListCategory, GeneAnnotationImport, GeneVersion, TranscriptVersion, GeneSymbolAlias
from genes.models import GeneListCategory, GeneAnnotationImport, GeneVersion, TranscriptVersion, GeneSymbolAlias, \
TranscriptVersionSequenceInfoFastaFileImport, TranscriptVersionSequenceInfo
from genes.models_enums import AnnotationConsortium, GeneSymbolAliasSource
from library.constants import WEEK_SECS
from library.django_utils import require_superuser, get_field_counts
Expand Down Expand Up @@ -194,14 +195,30 @@ def annotation(request):
if diagnostic_gene_list_count:
diagnostic_gene_list = f"{diagnostic_gene_list_count} diagnostic gene lists"

clinvar_citations = ClinVarCitation.objects.count()
if clinvar_citations:
if clinvar_citations := ClinVarCitation.objects.count():
num_cached_clinvar_citations = CachedCitation.objects.count()
clinvar_citations = f"{clinvar_citations} ClinVar citations ({num_cached_clinvar_citations} cached)"

hpa_version = HumanProteinAtlasAnnotationVersion.objects.order_by("-annotation_date").first()
hpa_counts = HumanProteinAtlasAnnotation.objects.filter(version=hpa_version).count()

transcript_fasta_imports = list(TranscriptVersionSequenceInfoFastaFileImport.objects.all().order_by("created"))
tvi_qs = TranscriptVersionSequenceInfo.objects.all()
tvi_api_count = tvi_qs.filter(fasta_import__isnull=True).count()
tvi_fasta_count = tvi_qs.filter(fasta_import__isnull=False).count()
if tvi_total := tvi_api_count + tvi_fasta_count:
transcript_version_sequence_info = f"{tvi_total} total"
subtotals = []
if tvi_api_count:
subtotals.append(f"{tvi_api_count} from API")
if tvi_fasta_count:
subtotals.append(f"{tvi_fasta_count} from {len(transcript_fasta_imports)} imports")
if subtotals:
subtotal = ", ".join(subtotals)
transcript_version_sequence_info += f" ({subtotal})"
else:
transcript_version_sequence_info = None

somalier = None
if somalier_enabled := settings.SOMALIER.get("enabled"):
somalier = _verify_somalier_config()
Expand All @@ -220,21 +237,25 @@ def annotation(request):
cwr, _ = CachedWebResource.objects.get_or_create(name=cwr_name)
cached_web_resources.append(cwr)

context = {"annotations_all_imported": annotations_all_imported,
"genome_build_annotations": genome_build_annotations,
"ensembl_biomart_transcript_genes": ensembl_biomart_transcript_genes,
"ontology_services": ontology_services,
"ontology_counts": ontology_counts,
"ontology_relationship_counts": ontology_relationship_counts,
"ontology_imports": ontology_imports,
"gene_symbol_alias_counts": gene_symbol_alias_counts,
"diagnostic_gene_list": diagnostic_gene_list,
"clinvar_citations": clinvar_citations,
"hpa_counts": hpa_counts,
"num_annotation_columns": VariantGridColumn.objects.count(),
"cached_web_resources": cached_web_resources,
"python_command": settings.PYTHON_COMMAND,
"somalier": somalier}
context = {
"annotations_all_imported": annotations_all_imported,
"genome_build_annotations": genome_build_annotations,
"ensembl_biomart_transcript_genes": ensembl_biomart_transcript_genes,
"ontology_services": ontology_services,
"ontology_counts": ontology_counts,
"ontology_relationship_counts": ontology_relationship_counts,
"ontology_imports": ontology_imports,
"gene_symbol_alias_counts": gene_symbol_alias_counts,
"diagnostic_gene_list": diagnostic_gene_list,
"clinvar_citations": clinvar_citations,
"hpa_counts": hpa_counts,
"transcript_version_sequence_info": transcript_version_sequence_info,
"transcript_fasta_imports": transcript_fasta_imports,
"num_annotation_columns": VariantGridColumn.objects.count(),
"cached_web_resources": cached_web_resources,
"python_command": settings.PYTHON_COMMAND,
"somalier": somalier
}
return render(request, "annotation/annotation.html", context)


Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import logging

from django.core.management import BaseCommand

from genes.models import TranscriptVersion, GeneAnnotationImport, TranscriptVersionSequenceInfo
from snpdb.models import GenomeBuild


class Command(BaseCommand):
def handle(self, *args, **options):

have_tvi = set()
for tvi in TranscriptVersionSequenceInfo.objects.all():
accession = f"{tvi.transcript_id}.{tvi.version}"
have_tvi.add(accession)

tv_lengths_37 = {}
tv_lengths_38 = {}
for tv in TranscriptVersion.objects.filter(transcript__annotation_consortium='R', alignment_gap=False,
genome_build=GenomeBuild.grch37()):
tv_lengths_37[tv.accession] = tv.length

for tv in TranscriptVersion.objects.filter(transcript__annotation_consortium='R', alignment_gap=False,
genome_build=GenomeBuild.grch38()):
tv_lengths_38[tv.accession] = tv.length

different_lengths = set()
for accession in set(tv_lengths_37) & set(tv_lengths_38):
length_37 = tv_lengths_37[accession]
length_38 = tv_lengths_38[accession]
if length_37 != length_38:
different_lengths.add(accession)

only_one_build = set(tv_lengths_37) ^ set(tv_lengths_38)
# Imports w/o GFFs (only from genePred) don't have alignment info (gap_count) so we can't detect alignment gaps
no_gff_imports = GeneAnnotationImport.objects.filter(filename__contains='genePred')
no_gff_tvs = {tv.accession for tv in TranscriptVersion.objects.filter(import_source__in=no_gff_imports)}

need_to_retrieve = (different_lengths | only_one_build | no_gff_tvs) - have_tvi
num_to_retrieve = len(need_to_retrieve)
logging.info("Retrieving %d Transcript Version Sequence Info records (takes ~1min per 1000)", num_to_retrieve)
TranscriptVersionSequenceInfo.get_refseq_transcript_versions(need_to_retrieve)
47 changes: 26 additions & 21 deletions genes/management/commands/import_refseq_transcript_fasta.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,13 @@
import logging
from collections import defaultdict
from collections import Counter

from Bio import SeqIO
from django.core.management import BaseCommand
from django.db.models import QuerySet

from genes.models import TranscriptVersionInfo, TranscriptVersionInfoFastaFileImport, TranscriptVersion, Transcript
from genes.models import TranscriptVersionSequenceInfo, TranscriptVersionSequenceInfoFastaFileImport, TranscriptVersion, Transcript
from genes.models_enums import AnnotationConsortium
from library.file_utils import file_md5sum, open_handle_gzip




class Command(BaseCommand):

def add_arguments(self, parser):
Expand All @@ -23,9 +19,9 @@ def handle(self, *args, **options):
overwrite = options["overwrite"]

md5_hash = file_md5sum(filename)
if existing_import := TranscriptVersionInfoFastaFileImport.objects.filter(md5_hash=md5_hash).first():
if existing_import := TranscriptVersionSequenceInfoFastaFileImport.objects.filter(md5_hash=md5_hash).first():
if overwrite:
print(f"Deleting existing TranscriptVersionInfos for fasta import {md5_hash}")
print(f"Deleting existing TranscriptVersionSequenceInfos for fasta import {md5_hash}")
existing_import.delete()
else:
raise ValueError(f"Fasta import {md5_hash} exists, use --overwrite to delete old data")
Expand All @@ -34,26 +30,35 @@ def handle(self, *args, **options):
if not known_transcripts:
raise ValueError("No transcripts! Insert them first!")

fasta_import = TranscriptVersionInfoFastaFileImport.objects.create(md5_hash=md5_hash,
annotation_consortium=AnnotationConsortium.REFSEQ,
filename=filename)
skipped_transcripts = 0
fasta_import = TranscriptVersionSequenceInfoFastaFileImport.objects.create(md5_hash=md5_hash,
annotation_consortium=AnnotationConsortium.REFSEQ,
filename=filename)
unknown_transcripts = []
unknown_transcript_prefixes = Counter()
records = []
with open_handle_gzip(filename, "rt") as f:
for record in SeqIO.parse(f, "fasta"):
transcript_id, version = TranscriptVersion.get_transcript_id_and_version(record.id)
if transcript_id not in known_transcripts:
skipped_transcripts += 1
continue
if transcript_id.startswith("X"):
continue # We don't want these
prefix = transcript_id.split("_")[0]
unknown_transcript_prefixes[prefix] += 1
unknown_transcripts.append(Transcript(identifier=transcript_id,
annotation_consortium=AnnotationConsortium.REFSEQ))

tvi = TranscriptVersionInfo(transcript_id=transcript_id, version=version,
fasta_import=fasta_import,
sequence=str(record.seq), length=len(record.seq))
tvi = TranscriptVersionSequenceInfo(transcript_id=transcript_id, version=version,
fasta_import=fasta_import,
sequence=str(record.seq), length=len(record.seq))
records.append(tvi)

print(f"Skipped {skipped_transcripts} transcripts not in our database")
if unknown_transcripts:
print(f"Inserting {len(unknown_transcripts)} unknown_transcripts")
print(unknown_transcript_prefixes)
Transcript.objects.bulk_create(unknown_transcripts, batch_size=2000)

if num_records := len(records):
print(f"Inserting {num_records} TranscriptVersionInfo records")
TranscriptVersionInfo.objects.bulk_create(records, ignore_conflicts=True, batch_size=2000)
print(f"Inserting {num_records} TranscriptVersionSequenceInfo records")
TranscriptVersionSequenceInfo.objects.bulk_create(records, ignore_conflicts=True, batch_size=2000)

TranscriptVersionInfo.set_transcript_version_alignment_gap_if_length_different(records)
TranscriptVersionSequenceInfo.set_transcript_version_alignment_gap_if_length_different(records)
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Generated by Django 3.2.6 on 2021-09-14 02:01
# Generated by Django 3.2.6 on 2021-09-14 12:08

from django.db import migrations, models
import django.db.models.deletion
Expand All @@ -13,7 +13,7 @@ class Migration(migrations.Migration):

operations = [
migrations.CreateModel(
name='TranscriptVersionInfoFastaFileImport',
name='TranscriptVersionSequenceInfoFastaFileImport',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('created', django_extensions.db.fields.CreationDateTimeField(auto_now_add=True, verbose_name='created')),
Expand All @@ -28,7 +28,7 @@ class Migration(migrations.Migration):
},
),
migrations.CreateModel(
name='TranscriptVersionInfo',
name='TranscriptVersionSequenceInfo',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('created', django_extensions.db.fields.CreationDateTimeField(auto_now_add=True, verbose_name='created')),
Expand All @@ -37,7 +37,7 @@ class Migration(migrations.Migration):
('api_response', models.TextField(null=True)),
('sequence', models.TextField()),
('length', models.IntegerField()),
('fasta_import', models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, to='genes.transcriptversioninfofastafileimport')),
('fasta_import', models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, to='genes.transcriptversionsequenceinfofastafileimport')),
('transcript', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='genes.transcript')),
],
options={
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Generated by Django 3.2.6 on 2021-09-14 12:08

from django.db import migrations

from manual.operations.manual_operations import ManualOperation


def _test_has_transcript_versions(apps):
""" Don't need to run this for new deployments """

TranscriptVersion = apps.get_model("genes", "TranscriptVersion")
return TranscriptVersion.objects.exists()

class Migration(migrations.Migration):

dependencies = [
('genes', '0039_transcriptversionsequenceinfo_transcriptversionsequenceinfofastafileimport'),
]

operations = [
ManualOperation.operation_other(args=[
"*** BEFORE fix_retrieve_transcript_version_sequence_info - import_refseq_transcript_fasta - see annotation page"],
test=_test_has_transcript_versions),
ManualOperation(task_id=ManualOperation.task_id_manage(["fix_retrieve_transcript_version_sequence_info"]),
test=_test_has_transcript_versions)
]
Loading

0 comments on commit 245374d

Please sign in to comment.