diff --git a/.gitignore b/.gitignore index 8abc040..fe5cf53 100644 --- a/.gitignore +++ b/.gitignore @@ -1,18 +1,28 @@ +# Files +git.hash +nextflow.config +todo +mito_full.nf + +# Suffix *~ -__pycache__/ container/*.sif *.sif .nextflow* -nextflow.config -work/ *.html *# -git.hash -todo -mito_full.nf -## ref tools -bin/reference_tools/*.log -bin/reference_tools/*.bed *.orig +*.tbi + +# Folders +__pycache__/ +work/ data/ +output/ .vscode/ + +# Ref tools +bin/reference_tools/*.log +bin/reference_tools/*.bed +bin/reference_tools/refdata/*.vcf.gz + diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bb3d8f..62e0b45 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # CHANGELOG +### 3.10.1 +* Update config for bed intersect +* Some fixes to the logging of the bed intersect script + ### 3.9.10 * Use reduced gene_panel JSON to avoid adding dead/archived panels to new scout cases * Add lennart-side script/worker CRON job to generate new gene panel JSON diff --git a/bin/reference_tools/agilient_hg38_nochr_noalt_1-3.bed b/bin/reference_tools/refdata/agilient_hg38_nochr_noalt_1-3.bed similarity index 100% rename from bin/reference_tools/agilient_hg38_nochr_noalt_1-3.bed rename to bin/reference_tools/refdata/agilient_hg38_nochr_noalt_1-3.bed diff --git a/bin/reference_tools/refdata/user_added.bed b/bin/reference_tools/refdata/user_added.bed new file mode 100644 index 0000000..dc89486 --- /dev/null +++ b/bin/reference_tools/refdata/user_added.bed @@ -0,0 +1 @@ +17 43387226 43387417 RNU2-4 diff --git a/bin/reference_tools/update_bed.pl b/bin/reference_tools/update_bed.pl deleted file mode 100755 index 42a935c..0000000 --- a/bin/reference_tools/update_bed.pl +++ /dev/null @@ -1,338 +0,0 @@ -#!/usr/bin/perl -w -use strict; -use Data::Dumper; -use File::Basename; -use lib dirname (__FILE__); -use vcf2 qw( parse_vcf ); -use Getopt::Long; - -my %opt = ( - skip_download => 0 -); -my @incl_bed_files; -GetOptions( \%opt, 'old=s', 'new=s', 'build=s', 'clinvardate=s', 'incl_bed=s@' => \@incl_bed_files, 'skip_download' ); - -my @required_params = qw(old new build clinvardate); -my @missing_params; - -foreach my $param (@required_params) { - push @missing_params, $param unless defined $opt{$param}; -} - -if (@missing_params) { - die "Error: Missing required parameters: " . join(", ", @missing_params) . "\nUsage: update_bed.pl --old --new --build <108> --clinvardate <20240701> [--incl_bed ] [--skip_download]\n"; -} - -if (@incl_bed_files) { - print "Additional included BED files: " . join(", ", @incl_bed_files) . "\n"; -} - -### Logic ### -# have a baseline bed that defines exonic regions, padded with 20(?) bp -# read clinvar vcf (likely pathogenic + pathogenic), convert it to bed, intersect. Regions not in baseline padded (5 bp?) and merge - -#my $clinvar_vcf = "/fs1/resources/ref/hg38/annotation_dbs/bedupdatertest/new_chrom8.vcf"; -#my $clinvar_vcf_old = "/fs1/resources/ref/hg38/annotation_dbs/bedupdatertest/old_chrom8.vcf"; -#my $clinvar_vcf = "/fs1/resources/ref/hg38/annotation_dbs/clinvar38_latest.vcf.gz"; -#my $clinvar_vcf_old = "/data/bnf/ref/annotations_dbs/clinvar38_20200106.vcf.gz"; -my $clinvar_vcf = $opt{'new'}; -my $clinvar_vcf_old = $opt{'old'}; -# my $agilent = "agilient_hg38_nochr_noalt_1-3.bed"; - -### BASE BED ######## -my $release = $opt{'build'}; # as argument -my $gtf = "Homo_sapiens.GRCh38.".$release.".gtf.gz"; -my $gtf_request = "https://ftp.ensembl.org/pub/release-".$release."/gtf/homo_sapiens/$gtf"; -my $base_bed = "exons_hg38_".$release.".bed"; - -## fetch $release from ensembl and get all exons for coding genes -get_base($gtf_request, $gtf, $base_bed, $release, $opt{'skip_download'}); - -### BED TO ADD DATA TO ### -my $clinvar = $opt{'clinvardate'}; -# final name of bed, will concat to this -my $final_bed_fp = "exons_".$release."padded20bp_clinvar-".$clinvar."padded5bp."."bed"; - -# if recreating same clinvarversion and ensembl build, delete old so it won't be concatenated -if (-e $final_bed_fp) { - unlink($final_bed_fp); -} - -## ADD DATA ## -# add_to_bed($final_bed_fp, $base_bed, "EXONS-$release"); - -foreach my $incl_bed (@incl_bed_files) { - my $suffix = $incl_bed; - add_to_bed($final_bed_fp, $incl_bed, $suffix); -} - -# add_to_bed($final_bed_fp, $agilent,"AGILENT-EXOME"); - -## clinvar variants ## -my ($clinvar_new, $new_benign) = read_clinvar($clinvar_vcf); -my ($clinvar_old, $old_benign) = read_clinvar($clinvar_vcf_old); - -my %old_benign = %$old_benign; -my %new_benign = %$new_benign; -my ($newtoadd, $oldtoremove) = compare_clinvar($clinvar_new, $clinvar_old, $final_bed_fp); -my $clinvarlog = "clinvar_".$clinvar.".log"; -my $clinvar_final_bed_fp = "clinvar_".$clinvar.".bed"; -if (-e $clinvarlog) { - unlink( $clinvarlog ); -} -if (-e $clinvar_final_bed_fp) { - unlink( $clinvar_final_bed_fp ); -} -clinvar_bed_and_info($newtoadd, "new", $clinvarlog, $new_benign); -clinvar_bed_and_info($oldtoremove, "old", $clinvarlog, $new_benign); -clinvar_bed_and_info($newtoadd, "bed", $clinvar_final_bed_fp, $new_benign); -add_to_bed($final_bed_fp, $clinvar_final_bed_fp, "."); - -sort_merge_output($final_bed_fp); - -## takes clinvar-vcf saves pathogenicity status, returns one hash with important variants -## and one with benign -sub read_clinvar { - my $vcf = shift; - my $vcf_hash = CMD::vcf2->new('file'=>$vcf); - my %clinvar_variants; - my %benign_clinvar_variants; - my $clinvar_bed = "clinvar.bed"; - - ## save all variants that is marked as Pathogenic in some way - while ( my $var = $vcf_hash->next_var() ) { - - my $sig = ""; my $confidence = ""; - my $haplo = ""; - if ($var->{INFO}->{CLNSIG}) { - $sig = $var->{INFO}->{CLNSIG}; - } - if ($var->{INFO}->{CLNSIGINCL}) { - $haplo = $var->{INFO}->{CLNSIGINCL}; - } - if ($var->{INFO}->{CLNSIGCONF}) { - $confidence = $var->{INFO}->{CLNSIGCONF}; - } - next if ($sig eq "" && $haplo); - my $keep = 0; - my $reason = ""; - if ( $sig =~ /Pathogenic/ || $sig =~ /Likely_pathogenic/ ) { - $keep = 1; - $reason = $sig; - } - elsif ( $sig =~ /Conflicting_interpretations_of_pathogenicity/ ) { - if ( $confidence =~ /Pathogenic/ || $confidence =~ /Likely_pathogenic/ ) { - $keep = 1; - $reason = $confidence; - } - } - elsif ( $haplo =~ /athogenic/ ) { - $keep = 1; - $reason = $haplo; - } - - - if ($keep) { - next if ($var->{CHROM} eq "MT"); - $clinvar_variants{$var->{CHROM}.":".$var->{POS}."_".$var->{REF}."_".$var->{ALT}}{INFO} = $var->{INFO}; - $clinvar_variants{$var->{CHROM}.":".$var->{POS}."_".$var->{REF}."_".$var->{ALT}}{CHROM} = $var->{CHROM}; - $clinvar_variants{$var->{CHROM}.":".$var->{POS}."_".$var->{REF}."_".$var->{ALT}}{POS} = $var->{POS}; - $clinvar_variants{$var->{CHROM}.":".$var->{POS}."_".$var->{REF}."_".$var->{ALT}}{REASON} = $reason; - - } - else { - $benign_clinvar_variants{$var->{CHROM}.":".$var->{POS}."_".$var->{REF}."_".$var->{ALT}}{INFO} = $var->{INFO}; - $benign_clinvar_variants{$var->{CHROM}.":".$var->{POS}."_".$var->{REF}."_".$var->{ALT}}{REASON} = $reason; - } - } - return \%clinvar_variants, \%benign_clinvar_variants; -} - -## fetches $release ensembl gtf and returns bed with coding genes exons -sub get_base { - my $gtf_req = shift; - my $gtf = shift; - my $base = shift; - my $release = shift; - my $skip_download = shift; - - unless ($skip_download) { - system("wget $gtf_req -O tmp.gtf.gz"); - system("gunzip tmp.gtf.gz"); - } - open(BASE,'>',$base); - open(GTF, "tmp.gtf"); - my $keep = 0; - while() { - chomp; - next if /^#/; - my @g =split /\t/; - next if length($g[0]) >2; - if( $g[2] eq "transcript" ) { - $keep = 0; - $keep = 1 if $g[8] =~ /transcript_biotype "protein_coding"/; - } - if( $g[2] eq "exon" and $keep ) { - print BASE $g[0]."\t".($g[3]-20)."\t".($g[4]+20)."\n"; - } - } - ## All of mitochondria ## - close GTF; - close BASE; - -} - -## concats bed files and adds 4th column describing why it was added -sub add_to_bed { - my $bed = shift; - my $bed2add = shift; - my $forthcolumn = shift; - open(BED2ADD, $bed2add); - open(BED, '>>', $bed); - while () { - chomp; - my @line = split('\t'); - next if $line[0] eq "M"; - if ($line[3]) { - print BED $line[0]."\t".$line[1]."\t".$line[2]."\t".$line[3]."\n"; - } - else { - print BED $line[0]."\t".$line[1]."\t".$line[2]."\t".$forthcolumn."\n"; - } - - } - close BED; - close BED2ADD; -} - -## merges bed-file targets and collapses 4th column -sub sort_merge_output { - my $bed = shift; - system( "bedtools sort -i $bed > tmp.sort.bed" ); - system( "bedtools merge -i tmp.sort.bed -c 4 -o collapse > $bed" ); - unlink( "tmp.sort.bed"); -} - -## finds calculates which variants to add to new bed, and which to remove -## depending on variant status. Take clinvar-hashes (old and new) returns -## list of variants to create bed -sub compare_clinvar { - my $new_clinvar = shift; - my $old_clinvar = shift; - my $final_bed_fp = shift; - my %new_clinvar = %$new_clinvar; - my %old_clinvar = %$old_clinvar; - - my $new_bed_fp = "clinvar_new.bed"; - my $old_bed_fp = "clinvar_old.bed"; - open (NEW, '>', $new_bed_fp); - open (OLD, '>', $old_bed_fp); - my @clinvar_in_common = (); - foreach my $key (keys %new_clinvar ) { - push(@clinvar_in_common, $key) if exists $old_clinvar{$key}; - print NEW $new_clinvar{$key}->{CHROM}."\t"; - print NEW ($new_clinvar{$key}->{POS} - 5)."\t"; - print NEW ($new_clinvar{$key}->{POS} + 5)."\t"; - my $info = clinvar_info($new_clinvar{$key}, $key); - print NEW "$info\n"; - } - my $clinvar_in_common_nbr = scalar @clinvar_in_common; - - my @clinvar_new_added = (); - foreach my $key (keys %new_clinvar ) { - push(@clinvar_new_added, $key) unless exists $old_clinvar{$key}; - } - my @clinvar_old_removed = (); - foreach my $key (keys %old_clinvar ) { - unless (exists $new_clinvar{$key}) { - push(@clinvar_old_removed, $key); - print OLD $old_clinvar{$key}->{CHROM}."\t"; - print OLD ($old_clinvar{$key}->{POS} - 5)."\t"; - print OLD ($old_clinvar{$key}->{POS} + 5)."\t"; - my $info = clinvar_info($old_clinvar{$key}, $key); - print OLD "$info\n"; - } - } - close NEW; - close OLD; - - my $new_to_add = intersect($new_bed_fp, $final_bed_fp); - unlink($new_bed_fp); - my $old_to_remove = intersect($old_bed_fp, $final_bed_fp); - unlink($old_bed_fp); - print "ClinVar in common between versions :".scalar(@clinvar_in_common)."\n"; - print "Added new(unique targets) :".scalar(@clinvar_new_added)."(".scalar(@{$new_to_add}).")"."\n"; - print "Removed old(unique targets) :".scalar(@clinvar_old_removed)."(".scalar(@{$old_to_remove}).")"."\n"; - return $new_to_add, $old_to_remove; -} - -## finds unique targets, important to keep track of what variants gets added -## and what variants get removed -sub intersect { - my $clinvar = shift; - my $bed = shift; - my @notinbed = `bedtools intersect -a $clinvar -b $bed -v`; - return \@notinbed; -} - -## concats INFO-field into 4th column for clinvarbed -sub clinvar_info { - my $info = shift; - my $var = shift; - my %info = %$info; - my @fourth; - - push (@fourth, $var); - push (@fourth, $info->{REASON}); - push (@fourth, $info->{INFO}->{CLNACC}); - - my $clndn = "Undefined"; - if (defined $fourth[3]) { - $clndn = $info->{INFO}->{CLNDN}; - } - - push (@fourth, $clndn); - - my $fourth = join('~', @fourth); - return $fourth; -} - -## creates a log for added and removed clinvar variants as well -## as creates the bed that gets added to the final big bed-file -sub clinvar_bed_and_info { - my $clinvar = shift; - my $new_or_old = shift; - my $clinvarlog = shift; - my $new_benign = shift; - - open (LOG, '>>', $clinvarlog); - foreach my $clinvar_line (@{ $clinvar }) { - chomp($clinvar_line); - - - my @clinvar_fields = split('\t', $clinvar_line); - my @clinvarreason = split("~", $clinvar_fields[3]); - - while (@clinvarreason < 3) { - push @clinvarreason, "[missing value]" - } - - my @pos = split('_' ,$clinvarreason[0]); - if ($new_or_old eq "old") { - my $innew = "MISSING"; - if ($new_benign{$clinvarreason[0]}->{INFO}->{CLNSIG}) { - $innew = $new_benign{$clinvarreason[0]}->{INFO}->{CLNSIG}; - } - print LOG "REMOVED:".$pos[0].":".$clinvarreason[2].":".$clinvarreason[1]."=>".$innew."\n"; - } - elsif ($new_or_old eq "new") { - print LOG "ADDED:".$pos[0].":".$clinvarreason[2].":".$clinvarreason[1]."\n"; - } - else { - print LOG $clinvar_fields[0]."\t".$clinvar_fields[1]."\t".$clinvar_fields[2]."\t"."CLINVAR-".$clinvarreason[1]."\n"; - } - - } - close LOG; -} - diff --git a/bin/reference_tools/update_bed.py b/bin/reference_tools/update_bed.py old mode 100644 new mode 100755 index 0cf94c3..b53c67a --- a/bin/reference_tools/update_bed.py +++ b/bin/reference_tools/update_bed.py @@ -23,6 +23,8 @@ logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s") LOG = logging.getLogger(__name__) +BED = list[list[str]] + """ CLNSIG: Clinical significance of variant CLNSIGINCL: Clinical significance for a haplotype or genotype that includes @@ -91,18 +93,17 @@ def main( LOG.info("%s pathogenic and %s benign loaded", len(clinvar_new), len(new_benign)) LOG.info("Calculating ClinVar differences") - (new_to_add_clinvar_bed_rows, old_to_remove_clinvar_bed_rows) = compare_clinvar( + ( + all_to_add_bed_rows, + new_clinvar_bed_rows, + old_removed_clinvar_bed_rows, + ) = compare_clinvar( clinvar_new, clinvar_old, final_bed_path, VARIANT_PAD, out_dir, keep_tmp ) - LOG.info( - "%s new BED entries found, %s BED entries to remove", - len(new_to_add_clinvar_bed_rows), - len(old_to_remove_clinvar_bed_rows), - ) id_col = 4 - new_to_remove_keys = [bed_row[id_col] for bed_row in new_to_add_clinvar_bed_rows] - old_to_remove_keys = [bed_row[id_col] for bed_row in old_to_remove_clinvar_bed_rows] + new_to_remove_keys = [bed_row[id_col] for bed_row in new_clinvar_bed_rows] + old_to_remove_keys = [bed_row[id_col] for bed_row in old_removed_clinvar_bed_rows] clinvar_log_path = ensure_new_empty_file(f"{out_dir}/clinvar_{clinvardate}.log") LOG.info("Writing log to %s", str(clinvar_log_path)) @@ -120,15 +121,13 @@ def main( annot_col = 3 # Slice to keep chrom, start, end and annotation, skip variant keys new_clinvar_to_add_path.write_text( - "\n".join( - ["\t".join(row[0 : annot_col + 1]) for row in new_to_add_clinvar_bed_rows] - ) + "\n".join(["\t".join(row[0 : annot_col + 1]) for row in all_to_add_bed_rows]) + "\n" ) append_to_bed(final_bed_path, new_clinvar_to_add_path, ".") LOG.info("Sort and merge output") - sort_merge_output(out_dir, final_bed_path, keep_tmp) + sort_merge_bed(final_bed_path, keep_tmp) LOG.info("Done") @@ -332,13 +331,13 @@ def append_to_bed(out_bed_fp: Path, bed_to_add_path: Path, bed_annot_default: st print("\t".join(out_fields), file=out_fh) -def sort_merge_output(out_dir: Path, bed_fp: Path, keep_tmp: bool): - tmp_bed_path = Path(out_dir / "tmp.sort.bed") - sort_cmd = f"bedtools sort -i {str(bed_fp)} > {str(tmp_bed_path)}" +def sort_merge_bed(bed_path: Path, keep_tmp: bool): + tmp_bed_path = bed_path.parent / (bed_path.name + ".tmp") + sort_cmd = f"bedtools sort -i {str(bed_path)} > {str(tmp_bed_path)}" # Annotation column is concatenated together # "distinct" means the same value won't be reused multiple times merge_cmd = ( - f"bedtools merge -i {str(tmp_bed_path)} -c 4 -o distinct > {str(bed_fp)}" + f"bedtools merge -i {str(tmp_bed_path)} -c 4 -o distinct > {str(bed_path)}" ) subprocess.call(sort_cmd, shell=True) subprocess.call(merge_cmd, shell=True) @@ -353,12 +352,13 @@ def compare_clinvar( variant_padding: int, out_dir: Path, keep_tmp: bool, -) -> tuple[list[list[str]], list[list[str]]]: +) -> tuple[BED, BED, BED]: """ 1. Check what variants are new / kept / removed among keys 2. Generate bed files for padded ClinVar variants to run "bedtools intersect" 3. Keep info about pathogenicity as the fourth column and variant key (chr:pos_ref_alt) in fifth - 4. Return bed field lists + 4. Log info about changes / what will be included in the bed based on the ClinVar + 5. Return bed field lists """ new_clinvar_keys = set(new_clinvar.keys()) @@ -383,37 +383,50 @@ def write_tmp_bed( ) path.write_text(bed_text) - new_clinvar_tmp_bed = out_dir / "clinvar_new_python.bed" - write_tmp_bed(new_clinvar_tmp_bed, new_clinvar, set()) - old_clinvar_tmp_bed = out_dir / "clinvar_old_python.bed" - clinvar_old_removed = old_clinvar_keys.difference(new_clinvar_keys) - write_tmp_bed(old_clinvar_tmp_bed, old_clinvar, clinvar_old_removed) - + # First we check what ClinVar variants differs between the ClinVar VCFs clinvar_in_common = new_clinvar_keys.intersection(old_clinvar_keys) clinvar_new_added = new_clinvar_keys.difference(old_clinvar_keys) + clinvar_old_removed = old_clinvar_keys.difference(new_clinvar_keys) - new_to_add_bed_rows = get_bed_intersect( - str(new_clinvar_tmp_bed), str(final_bed_path) + # Write bed files with padded ClinVar variants + new_clinvar_added_bed = out_dir / "clinvar_new_added.bed" + write_tmp_bed(new_clinvar_added_bed, new_clinvar, clinvar_new_added) + old_clinvar_removed_bed = out_dir / "clinvar_old_removed.bed" + write_tmp_bed(old_clinvar_removed_bed, old_clinvar, clinvar_old_removed) + new_clinvar_all_bed = out_dir / "clinvar_new_all.bed" + write_tmp_bed(new_clinvar_all_bed, new_clinvar, set()) + + # Check the following regions: + # Newly added old ClinVar -> new ClinVar + newly_added_bed_rows = get_bed_intersect( + str(new_clinvar_added_bed), + str(final_bed_path), + ) + all_added_bed_rows = get_bed_intersect( + str(new_clinvar_all_bed), str(final_bed_path) ) - old_to_remove_bed_rows = get_bed_intersect( - str(old_clinvar_tmp_bed), str(final_bed_path) + old_previously_present_bed_rows = get_bed_intersect( + str(old_clinvar_removed_bed), + str(final_bed_path), ) if not keep_tmp: - new_clinvar_tmp_bed.unlink() - old_clinvar_tmp_bed.unlink() + new_clinvar_added_bed.unlink() + old_clinvar_removed_bed.unlink() + new_clinvar_all_bed.unlink() LOG.info(f"Clinvar in common between versions: {len(clinvar_in_common)}") LOG.info( - f"Added new (unique targets): {len(clinvar_new_added)} ({len(new_to_add_bed_rows)})" + f"New ClinVar variants (total added ClinVar sites) (new ClinVar sites): {len(clinvar_new_added)} ({len(all_added_bed_rows)}) ({len(newly_added_bed_rows)})" ) LOG.info( - f"Removed old (unique targets): {len(clinvar_old_removed)} ({len(old_to_remove_bed_rows)})" + f"Removed ClinVar variants (old removed ClinVar sites): {len(clinvar_old_removed)} ({len(old_previously_present_bed_rows)})" ) + LOG.info("Note - the numbers above are before sorting and merging intervals") - return (new_to_add_bed_rows, old_to_remove_bed_rows) + return (all_added_bed_rows, newly_added_bed_rows, old_previously_present_bed_rows) -def get_bed_intersect(left_bed: str, right_bed: str) -> list[list[str]]: +def get_bed_intersect(left_bed: str, right_bed: str) -> BED: not_in_bed_cmd = [ "bedtools", "intersect", diff --git a/configs/nextflow.hopper.config b/configs/nextflow.hopper.config index d83b407..b82f22d 100644 --- a/configs/nextflow.hopper.config +++ b/configs/nextflow.hopper.config @@ -113,7 +113,7 @@ profiles { params.outdir = "${params.resultsdir}${params.dev_suffix}" params.subdir = 'wgs' params.crondir = "${params.outdir}/cron/" - params.intersect_bed = "${params.refpath}/bed/wgsexome/exons_108padded20bp_clinvar-20231230padded5bp.bed" + params.intersect_bed = "${params.refpath}/bed/wgsexome/exons_108padded20bp_clinvar-20240825padded5bp.bed" params.align = true params.varcall = true params.annotate = true