Skip to content

Commit

Permalink
#459 - New gene annotation releases
Browse files Browse the repository at this point in the history
  • Loading branch information
davmlaw committed Aug 25, 2021
1 parent 7c4e81c commit 1be6cdb
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 11 deletions.
2 changes: 1 addition & 1 deletion genes/management/commands/import_gene_annotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -591,7 +591,7 @@ def get_gene_version(self, attributes):

return GeneVersion(gene_id=attributes["gene_id"],
version=int(attributes["version"]),
gene_symbol_id=attributes["Name"],
gene_symbol_id=attributes.get("Name"), # Can be null (eg lncRNA)
hgnc_id=hgnc_id,
description=description,
biotype=attributes["biotype"],
Expand Down
19 changes: 19 additions & 0 deletions genes/migrations/0033_alter_geneversion_gene_symbol.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Generated by Django 3.2.1 on 2021-08-24 11:21

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    """Alter GeneVersion.gene_symbol so the foreign key accepts NULL."""

    # Must be applied after the previous migration of the 'genes' app
    dependencies = [('genes', '0032_alter_geneannotationrelease_version')]

    operations = [
        migrations.AlterField(
            model_name='geneversion',
            name='gene_symbol',
            field=models.ForeignKey(
                to='genes.genesymbol',
                on_delete=django.db.models.deletion.CASCADE,
                null=True,
            ),
        ),
    ]
3 changes: 2 additions & 1 deletion genes/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,8 @@ class GeneVersion(models.Model):
Genes/TranscriptVersion needs to be able to represent both RefSeq and Ensembl """
gene = models.ForeignKey(Gene, on_delete=CASCADE)
version = models.IntegerField() # RefSeq GeneIDs are always 1 (not versioned)
gene_symbol = models.ForeignKey(GeneSymbol, on_delete=CASCADE)
# symbol can be null as Ensembl has genes w/o symbols, eg ENSG00000238009 (lncRNA)
gene_symbol = models.ForeignKey(GeneSymbol, null=True, on_delete=CASCADE)
hgnc = models.ForeignKey(HGNC, null=True, on_delete=CASCADE)
description = models.TextField(null=True)
biotype = models.TextField(null=True)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#!/bin/bash

ftp://ftp.ensembl.org/pub/release-78/gtf/homo_sapiens/Homo_sapiens.GRCh38.78.gtf.gz
wget ftp://ftp.ensembl.org/pub/release-78/gtf/homo_sapiens/Homo_sapiens.GRCh38.78.gtf.gz

for release in 76 77 78 79 80; do
wget ftp://ftp.ensembl.org/pub/release-${release}/gtf/homo_sapiens/Homo_sapiens.GRCh38.${release}.gtf.gz
done

#81 is first GFF3 for GRCh38
for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101; do
for release in 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104; do
wget ftp://ftp.ensembl.org/pub/release-${release}/gff3/homo_sapiens/Homo_sapiens.GRCh38.${release}.gff3.gz
done
11 changes: 4 additions & 7 deletions genes/scripts/refseq/download_refseq_gene_annotation_grch38.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,7 @@ wget ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/
wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.gff.gz

# These all have the same name, so rename them based on release ID
wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109.20190607/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz --output-document=GCF_000001405.39_GRCh38.p13_genomic.109.20190607.gff.gz
wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109.20190905/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz --output-document=GCF_000001405.39_GRCh38.p13_genomic.109.20190905.gff.gz
wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109.20191205/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz --output-document=GCF_000001405.39_GRCh38.p13_genomic.109.20191205.gff.gz
wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109.20200228/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz --output-document=GCF_000001405.39_GRCh38.p13_genomic.109.20200228.gff.gz
wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109.20200522/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz --output-document=GCF_000001405.39_GRCh38.p13_genomic.109.20200522.gff.gz
wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109.20200815/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz --output-document=GCF_000001405.39_GRCh38.p13_genomic.109.20200815.gff.gz
wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/109.20201120/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz --output-document=GCF_000001405.39_GRCh38.p13_genomic.109.20201120.gff.gz

for release in 109.20190607 109.20190905 109.20191205 109.20200228 109.20200522 109.20200815 109.20201120 109.20210226 109.20210514; do
wget ftp://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/${release}/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz --output-document=GCF_000001405.39_GRCh38.p13_genomic.${release}.gff.gz
done
78 changes: 78 additions & 0 deletions genes/scripts/updates/210824_new_annotations.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/bin/bash
#
# One-off update (2021-08-24): fetch newer GRCh38 gene annotation releases,
# convert each GFF3 to genePred with UCSC gff3ToGenePred, and load both files
# through the 'import_gene_annotation' management command.
#
# Required environment:
#   VG_DIR - directory that contains manage.py
#
# Creates 'ensembl/' and 'refseq/' below the current working directory.
# Downloads and conversions are written to '<name>.partial' and renamed only
# on success, so an interrupted run never leaves a truncated file that a later
# run would mistake for a complete one.

set -e

# VG dir
if [[ -z "${VG_DIR:-}" ]]; then
  echo "You need to define 'VG_DIR'" >&2
  exit 1
fi

# Locate gff3ToGenePred. '|| true' is required: under 'set -e' a failed lookup
# inside a plain assignment would abort the script before the download
# fallback below could run.
GFF3_TO_GENEPRED=$(command -v gff3ToGenePred || true)
if [[ -z "${GFF3_TO_GENEPRED}" ]]; then
  # Absolute path, because the script changes directory before using the tool
  GFF3_TO_GENEPRED="$(pwd)/gff3ToGenePred"
  if [[ ! -x "${GFF3_TO_GENEPRED}" ]]; then
    echo "Downloading gff3ToGenePred command line tool"
    wget --output-document="${GFF3_TO_GENEPRED}" \
      hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/gff3ToGenePred
    chmod a+x "${GFF3_TO_GENEPRED}"
  fi
fi


# Ensembl
echo "Downloading ENSEMBL"
mkdir -p ensembl
cd ensembl

# NOTE(review): the Ensembl import is switched off with 'if false' in the
# original script; the reason is not recorded here - confirm before enabling.
if false; then
  for release in 102 103 104; do
    gff="Homo_sapiens.GRCh38.${release}.gff3.gz"
    # Ensembl files end in '.gff3.gz', so that is the suffix to strip
    GENEPRED="$(basename "${gff}" .gff3.gz).genePred"
    echo "GenePred = ${GENEPRED}"

    if [[ ! -e "${gff}" ]]; then
      wget --output-document="${gff}.partial" \
        "ftp://ftp.ensembl.org/pub/release-${release}/gff3/homo_sapiens/${gff}"
      mv "${gff}.partial" "${gff}"
    fi

    if [[ ! -e "${GENEPRED}" ]]; then
      "${GFF3_TO_GENEPRED}" -processAllGeneChildren "${gff}" "${GENEPRED}.partial"
      mv "${GENEPRED}.partial" "${GENEPRED}"
    fi

    echo "Inserting gene annotation"

    python3.8 "${VG_DIR}/manage.py" import_gene_annotation --genome-build=GRCh38 --replace --annotation-consortium=Ensembl \
      --gff "${gff}" \
      --genePred "${GENEPRED}"

  done
fi

cd ..

# RefSeq
echo "Downloading RefSeq"

mkdir -p refseq
cd refseq

for release in 109.20210226 109.20210514; do
  # Every release is published under the same file name, so the release ID is
  # embedded in the local name to keep them apart.
  gff="GCF_000001405.39_GRCh38.p13_genomic.${release}.gff.gz"
  GENEPRED="$(basename "${gff}" .gff.gz).genePred"

  if [[ ! -e "${gff}" ]]; then
    echo "Downloading '${gff}'"
    # FTP is corrupt, trying http
    wget --output-document="${gff}.partial" \
      "http://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/annotation/annotation_releases/${release}/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz"
    mv "${gff}.partial" "${gff}"
  fi

  if [[ ! -e "${GENEPRED}" ]]; then
    "${GFF3_TO_GENEPRED}" -processAllGeneChildren -maxParseErrors=-1 -geneNameAttr=Name -rnaNameAttr=transcript_id "${gff}" "${GENEPRED}.partial"
    mv "${GENEPRED}.partial" "${GENEPRED}"
  fi

  python3.8 "${VG_DIR}/manage.py" import_gene_annotation --genome-build=GRCh38 --replace --annotation-consortium=RefSeq \
    --gff "${gff}" \
    --genePred "${GENEPRED}"

done

cd ..

0 comments on commit 1be6cdb

Please sign in to comment.