From 1be6cdb635425ea009fd450c0cc0b6ea65d26e5a Mon Sep 17 00:00:00 2001
From: Dave Lawrence
Date: Wed, 25 Aug 2021 15:21:26 +0930
Subject: [PATCH] #459 - New gene annotation releases

---
 .../commands/import_gene_annotation.py        |  2 +-
 .../0033_alter_geneversion_gene_symbol.py     | 19 +++++
 genes/models.py                               |  3 +-
 ...download_ensembl_gene_annotation_grch38.sh |  4 +-
 .../download_refseq_gene_annotation_grch38.sh | 11 +--
 .../scripts/updates/210824_new_annotations.sh | 91 +++++++++++++++++++
 6 files changed, 119 insertions(+), 11 deletions(-)
 create mode 100644 genes/migrations/0033_alter_geneversion_gene_symbol.py
 create mode 100755 genes/scripts/updates/210824_new_annotations.sh

diff --git a/genes/management/commands/import_gene_annotation.py b/genes/management/commands/import_gene_annotation.py
index d674ccec5..469dc7ae9 100644
--- a/genes/management/commands/import_gene_annotation.py
+++ b/genes/management/commands/import_gene_annotation.py
@@ -591,7 +591,7 @@ def get_gene_version(self, attributes):
 
         return GeneVersion(gene_id=attributes["gene_id"],
                            version=int(attributes["version"]),
-                           gene_symbol_id=attributes["Name"],
+                           gene_symbol_id=attributes.get("Name"),  # Can be null (eg lncRNA)
                            hgnc_id=hgnc_id,
                            description=description,
                            biotype=attributes["biotype"],
diff --git a/genes/migrations/0033_alter_geneversion_gene_symbol.py b/genes/migrations/0033_alter_geneversion_gene_symbol.py
new file mode 100644
index 000000000..8af3c2651
--- /dev/null
+++ b/genes/migrations/0033_alter_geneversion_gene_symbol.py
@@ -0,0 +1,19 @@
+# Generated by Django 3.2.1 on 2021-08-24 11:21
+
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('genes', '0032_alter_geneannotationrelease_version'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='geneversion',
+            name='gene_symbol',
+            field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, to='genes.genesymbol'),
+        ),
+    ]
diff --git a/genes/models.py b/genes/models.py
index a559b10e5..f98d73050 100644
--- a/genes/models.py
+++ b/genes/models.py
@@ -408,7 +408,8 @@ class GeneVersion(models.Model):
         Genes/TranscriptVersion needs to be able to represent both RefSeq and Ensembl """
     gene = models.ForeignKey(Gene, on_delete=CASCADE)
     version = models.IntegerField()  # RefSeq GeneIDs are always 1 (not versioned)
-    gene_symbol = models.ForeignKey(GeneSymbol, on_delete=CASCADE)
+    # symbol can be null as Ensembl has genes w/o symbols, eg ENSG00000238009 (lncRNA)
+    gene_symbol = models.ForeignKey(GeneSymbol, null=True, on_delete=CASCADE)
     hgnc = models.ForeignKey(HGNC, null=True, on_delete=CASCADE)
     description = models.TextField(null=True)
     biotype = models.TextField(null=True)
diff --git a/genes/scripts/ensembl/download_ensembl_gene_annotation_grch38.sh b/genes/scripts/ensembl/download_ensembl_gene_annotation_grch38.sh
index 17fbdf99c..c7c084534 100755
--- a/genes/scripts/ensembl/download_ensembl_gene_annotation_grch38.sh
+++ b/genes/scripts/ensembl/download_ensembl_gene_annotation_grch38.sh
@@ -1,12 +1,12 @@
 #!/bin/bash
 
-ftp://ftp.ensembl.org/pub/release-78/gtf/homo_sapiens/Homo_sapiens.GRCh38.78.gtf.gz
+wget ftp://ftp.ensembl.org/pub/release-78/gtf/homo_sapiens/Homo_sapiens.GRCh38.78.gtf.gz
 
 for release in 76 77 78 79 80; do
     wget ftp://ftp.ensembl.org/pub/release-${release}/gtf/homo_sapiens/Homo_sapiens.GRCh38.${release}.gtf.gz
 done
 
 #81 is first GFF3 for GRCh38
-for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101; do
+for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104; do
     wget ftp://ftp.ensembl.org/pub/release-${release}/gff3/homo_sapiens/Homo_sapiens.GRCh38.${release}.gff3.gz
 done
\ No newline at end of file
diff --git a/genes/scripts/refseq/download_refseq_gene_annotation_grch38.sh b/genes/scripts/refseq/download_refseq_gene_annotation_grch38.sh
index 01ae7fded..81efbd897 100755
--- a/genes/scripts/refseq/download_refseq_gene_annotation_grch38.sh
+++ b/genes/scripts/refseq/download_refseq_gene_annotation_grch38.sh
@@ -7,10 +7,7 @@ wget ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/
 wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.gff.gz
 
 # These all have the same name, so rename them based on release ID
-wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109.20190607/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz --output-document=GCF_000001405.39_GRCh38.p13_genomic.109.20190607.gff.gz
-wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109.20190905/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz --output-document=GCF_000001405.39_GRCh38.p13_genomic.109.20190905.gff.gz
-wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109.20191205/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz --output-document=GCF_000001405.39_GRCh38.p13_genomic.109.20191205.gff.gz
-wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109.20200228/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz --output-document=GCF_000001405.39_GRCh38.p13_genomic.109.20200228.gff.gz
-wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109.20200522/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz --output-document=GCF_000001405.39_GRCh38.p13_genomic.109.20200522.gff.gz
-wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109.20200815/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz --output-document=GCF_000001405.39_GRCh38.p13_genomic.109.20200815.gff.gz
-wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109.20201120/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz --output-document=GCF_000001405.39_GRCh38.p13_genomic.109.20201120.gff.gz
+
+for release in 109.20190607 109.20190905 109.20191205 109.20200228 109.20200522 109.20200815 109.20201120 109.20210226 109.20210514; do
+    wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/${release}/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz --output-document=GCF_000001405.39_GRCh38.p13_genomic.${release}.gff.gz
+done
diff --git a/genes/scripts/updates/210824_new_annotations.sh b/genes/scripts/updates/210824_new_annotations.sh
new file mode 100755
index 000000000..5c85e65b3
--- /dev/null
+++ b/genes/scripts/updates/210824_new_annotations.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+#
+# One-off update: download the GRCh38 gene annotation releases listed below
+# (Ensembl 102-104; RefSeq 109.20210226 and 109.20210514), convert each GFF3
+# to genePred and load it with the 'import_gene_annotation' management command.
+#
+# Requires VG_DIR to be set; ${VG_DIR}/manage.py is what gets run.
+# Files are kept in ./ensembl and ./refseq under the current directory.
+
+set -e
+
+# VG dir
+if [ -z "${VG_DIR}" ]; then
+  echo "You need to define 'VG_DIR'"
+  exit 1;
+fi
+
+# '|| true' keeps 'set -e' from aborting the script when the tool is not on
+# the PATH, so the download fallback below can actually run.
+GFF3_TO_GENEPRED=$(command -v gff3ToGenePred || true)
+if [ -z "${GFF3_TO_GENEPRED}" ]; then
+  echo "Downloading gff3ToGenePred command line tool"
+  wget hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/gff3ToGenePred
+  chmod a+x gff3ToGenePred
+  # Absolute path: the tool is invoked after 'cd ensembl' / 'cd refseq'
+  GFF3_TO_GENEPRED="$(pwd)/gff3ToGenePred"
+fi
+
+
+# Ensembl
+echo "Downloading ENSEMBL"
+mkdir -p ensembl
+cd ensembl
+
+# NOTE: the Ensembl import is switched off by 'if false'
+if false; then
+  for release in 102 103 104; do
+    gff=Homo_sapiens.GRCh38.${release}.gff3.gz;
+    # Ensembl files end in .gff3.gz, so that is the suffix to strip
+    GENEPRED="$(basename ${gff} .gff3.gz).genePred"
+    echo "GenePred = ${GENEPRED}";
+
+    if [[ ! -e ${gff} ]]; then
+      wget ftp://ftp.ensembl.org/pub/release-${release}/gff3/homo_sapiens/${gff}
+    fi
+
+    if [[ ! -e ${GENEPRED} ]]; then
+      ${GFF3_TO_GENEPRED} -processAllGeneChildren ${gff} ${GENEPRED}
+    fi
+
+    echo "Inserting gene annotation"
+
+    python3.8 "${VG_DIR}/manage.py" import_gene_annotation --genome-build=GRCh38 --replace --annotation-consortium=Ensembl \
+      --gff ${gff} \
+      --genePred ${GENEPRED}
+
+  done
+fi
+
+cd ..
+
+# RefSeq
+echo "Downloading RefSeq"
+
+mkdir -p refseq
+cd refseq
+
+for release in 109.20210226 109.20210514; do
+  gff=GCF_000001405.39_GRCh38.p13_genomic.${release}.gff.gz
+  GENEPRED="$(basename ${gff} .gff.gz).genePred"
+
+  if [[ ! -e ${gff} ]]; then
+    echo "Downloading '${gff}'"
+    # FTP is corrupt, trying http
+    # Download to a temporary name (--output-document overwrites anything left
+    # by an interrupted run) and rename only once wget has succeeded
+    wget http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/${release}/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz --output-document="${gff}.tmp"
+    mv "${gff}.tmp" "${gff}"
+  fi
+
+  if [[ ! -e ${GENEPRED} ]]; then
+    ${GFF3_TO_GENEPRED} -processAllGeneChildren -maxParseErrors=-1 -geneNameAttr=Name -rnaNameAttr=transcript_id ${gff} ${GENEPRED}
+  fi
+
+  python3.8 "${VG_DIR}/manage.py" import_gene_annotation --genome-build=GRCh38 --replace --annotation-consortium=RefSeq \
+    --gff ${gff} \
+    --genePred ${GENEPRED}
+
+done
+
+cd ..