Add the custom perl scripts

Add the current versions of the custom Perl scripts for selecting the records of a fasta file whose header matches a given value, for excluding the records of a fasta file whose header matches a given value, for converting a gff file to a gtf file, and for collecting microRNA annotation information into a txt file
nnalpas · Jan 21, 2015 · f1115d5 · f1115d5
1 parent 6062e19
commit f1115d5
Show file tree

Hide file tree

Showing 4 changed files with 383 additions and 0 deletions.
diff --git a/Perl_scripts/Fasta_ignore_value.pl b/Perl_scripts/Fasta_ignore_value.pl
@@ -0,0 +1,88 @@
+#!/usr/bin/perl -w
+
+# Script used on a fasta file to drop every record whose header matches a given
+# value: the matching header lines and the sequence lines that follow them are
+# skipped, every other line is copied to the output fasta file.
+# Usage: Fasta_ignore_value.pl -fasta in.fa -ignore PATTERN -output out.fa
+
+# Define all modules to be used in script
+use strict;
+use warnings;
+use Getopt::Long;
+use IO::File;
+
+# Define the command line parameters
+my $fasta;    # Input fasta file
+my $ignore;   # Pattern (regular expression) for headers whose record is ignored
+my $output;   # Output fasta file, must not exist yet
+
+# Read the parameters and stop on malformed options instead of carrying on
+GetOptions (
+    'fasta=s' => \$fasta,
+    'ignore=s' => \$ignore,
+    'output=s' => \$output,
+) or die "Invalid command line parameters!\n";
+
+my $start_date = localtime;
+print STDERR "START = $start_date\n\n";
+
+# Check all parameters before opening any file; defined/length is used so that
+# a value such as "0" is accepted
+unless (defined $fasta && length $fasta) {
+    die "Please specify the fasta file via -fasta parameter!\n";
+}
+unless (defined $ignore && length $ignore) {
+    die "Please specify the value to ignore in the fasta header via -ignore parameter!\n";
+}
+unless (defined $output && length $output) {
+    die "Please specify the output fasta file via -output parameter!\n";
+}
+
+# Compile the pattern once; an invalid regular expression stops the script here
+my $ignore_re = qr/$ignore/;
+
+# Open the input fasta file; the three-argument open keeps special characters
+# in the file name from being read as an open mode
+open (my $fasta_fh, '<', $fasta) or die "Cannot open $fasta: $!\n";
+
+# Open the output fasta file, refusing to overwrite an existing file
+if (-e $output) {
+    die "This file: $output already exists, cannot overwrite it!\n";
+}
+open (my $output_fh, '>', $output) or die "Cannot open $output: $!\n";
+
+# Define variables required for reading the input file
+my $action = "keep";  # Fate of the current record; lines before the first header are kept
+my $total = 0;        # Variable to count line in input file
+my $ig_count = 0;     # Variable to count line ignored from input file
+my $keep_count = 0;   # Variable to count line kept from input file
+
+# Read the fasta file line by line; the loop simply ends at end of file, so an
+# empty input file gives an empty output file and counts of zero
+while (my $line = <$fasta_fh>) {
+    chomp ($line);
+    $total ++;
+    # Only a header line decides the fate of a record, so a sequence line that
+    # happens to match the pattern cannot switch the state
+    if ($line =~ /^>/) {
+        $action = ($line =~ $ignore_re) ? "remove" : "keep";
+    }
+    if ($action eq "remove") {
+        $ig_count ++;
+    }
+    else {
+        $keep_count ++;
+        print {$output_fh} "$line\n";
+    }
+}
+close ($fasta_fh);
+
+# Buffered write errors only show up when the output file is closed
+close ($output_fh) or die "Cannot close $output: $!\n";
+
+print STDERR "The total line count from input file is: $total; ignored line is: $ig_count and kept line is: $keep_count!\n\n";
+
+my $finish_date = localtime;
+print STDERR "Finish = $finish_date\n\n";
+
+__END__
diff --git a/Perl_scripts/Fasta_keep_value.pl b/Perl_scripts/Fasta_keep_value.pl
@@ -0,0 +1,88 @@
+#!/usr/bin/perl -w
+
+# Script used on a fasta file to extract every record whose header matches a
+# given value: the matching header lines and the sequence lines that follow them
+# are copied to the output fasta file, every other line is skipped.
+# Usage: Fasta_keep_value.pl -fasta in.fa -keep PATTERN -output out.fa
+
+# Define all modules to be used in script
+use strict;
+use warnings;
+use Getopt::Long;
+use IO::File;
+
+# Define the command line parameters
+my $fasta;    # Input fasta file
+my $keep;     # Pattern (regular expression) for headers whose record is kept
+my $output;   # Output fasta file, must not exist yet
+
+# Read the parameters and stop on malformed options instead of carrying on
+GetOptions (
+    'fasta=s' => \$fasta,
+    'keep=s' => \$keep,
+    'output=s' => \$output,
+) or die "Invalid command line parameters!\n";
+
+my $start_date = localtime;
+print STDERR "START = $start_date\n\n";
+
+# Check all parameters before opening any file; defined/length is used so that
+# a value such as "0" is accepted
+unless (defined $fasta && length $fasta) {
+    die "Please specify the fasta file via -fasta parameter!\n";
+}
+unless (defined $keep && length $keep) {
+    die "Please specify the value to keep in the fasta header via -keep parameter!\n";
+}
+unless (defined $output && length $output) {
+    die "Please specify the output fasta file via -output parameter!\n";
+}
+
+# Compile the pattern once; an invalid regular expression stops the script here
+my $keep_re = qr/$keep/;
+
+# Open the input fasta file; the three-argument open keeps special characters
+# in the file name from being read as an open mode
+open (my $fasta_fh, '<', $fasta) or die "Cannot open $fasta: $!\n";
+
+# Open the output fasta file, refusing to overwrite an existing file
+if (-e $output) {
+    die "This file: $output already exists, cannot overwrite it!\n";
+}
+open (my $output_fh, '>', $output) or die "Cannot open $output: $!\n";
+
+# Define variables required for reading the input file
+my $action = "remove";  # Fate of the current record; lines before the first header are skipped
+my $total = 0;          # Variable to count line in input file
+my $ig_count = 0;       # Variable to count line ignored from input file
+my $keep_count = 0;     # Variable to count line kept from input file
+
+# Read the fasta file line by line; the loop simply ends at end of file, so an
+# empty input file gives an empty output file and counts of zero
+while (my $line = <$fasta_fh>) {
+    chomp ($line);
+    $total ++;
+    # Only a header line decides the fate of a record, so a sequence line that
+    # happens to match the pattern cannot switch the state
+    if ($line =~ /^>/) {
+        $action = ($line =~ $keep_re) ? "keep" : "remove";
+    }
+    if ($action eq "keep") {
+        $keep_count ++;
+        print {$output_fh} "$line\n";
+    }
+    else {
+        $ig_count ++;
+    }
+}
+close ($fasta_fh);
+
+# Buffered write errors only show up when the output file is closed
+close ($output_fh) or die "Cannot close $output: $!\n";
+
+print STDERR "The total line count from input file is: $total; ignored line is: $ig_count and kept line is: $keep_count!\n\n";
+
+my $finish_date = localtime;
+print STDERR "Finish = $finish_date\n\n";
+
+__END__
diff --git a/Perl_scripts/gff2gtf.pl b/Perl_scripts/gff2gtf.pl
@@ -0,0 +1,82 @@
+#!/usr/bin/perl -w
+
+# Script used to convert gff format to gtf format: each miRNA or
+# miRNA_primary_transcript feature becomes one exon line with gene_id and gene_name
+
+# Define all modules to be used in script
+use strict;
+use warnings;
+use Getopt::Long;
+use IO::File;
+
+# Define the command line parameters
+my $input;    # Input file containing gff format annotation
+my $output;   # Output file containing gtf format annotation (optional)
+
+# Read the parameters and stop on malformed options instead of carrying on
+GetOptions (
+    'i=s' => \$input,
+    'o=s' => \$output,
+) or die "Invalid command line parameters!\n";
+
+my $start_date = localtime;
+print STDERR "START = $start_date\n\n";
+
+# Open the input gff file; the three-argument open keeps special characters in
+# the file name from being read as an open mode
+unless (defined $input && length $input) {
+    die "Please specify the gff file to convert via -i parameter!\n";
+}
+open (my $input_fh, '<', $input) or die "Cannot open $input: $!\n";
+
+# Define output file: swap a .gff or .gff3 extension for .gtf, or append .gtf,
+# so that the generated name can never be the input file itself
+unless (defined $output && length $output) {
+    ($output = $input) =~ s/\.gff3?$//;
+    $output .= ".gtf";
+    print STDERR "File name for gtf output file not provided (-o parameter), name generated from input gff file: $output\n";
+}
+
+# Open the ouput file which will be in gtf format, refusing to overwrite
+if (-e $output) {
+    die "This file: $output already exists, cannot overwrite it!\n";
+}
+open (my $output_fh, '>', $output) or die "Cannot open $output: $!\n";
+
+# Biotype written in the second gtf column for each supported gff feature
+my %biotype_for = (
+    'miRNA_primary_transcript' => "pre-miRNA",
+    'miRNA' => "miRNA",
+);
+my $gff_line = 0;   # Variable to count line in input file
+
+# Read and split the gff file for converting to gtf format; the loop ends at end
+# of file, so an empty input file gives an empty output file
+while (my $line = <$input_fh>) {
+    chomp ($line);
+    $gff_line ++;
+    next if $line =~ /^#/ || $line !~ /\S/;   # Skip comment and blank lines
+    my ($chromosome, $source, $feature, $start, $end, $score, $strand, $frame, $attributes) = (split(/\t/, $line));
+    die "Fewer than 9 tab-separated columns at gff file line: $gff_line!\n" unless defined $attributes;
+    my $biotype = $biotype_for{$feature};
+    die "Biotype value not recognised at gff file line: $gff_line!\n" unless defined $biotype;
+    # Look the attributes up by key so that their order in the file does not matter
+    my %attribute;
+    foreach my $pair (split(/;/, $attributes)) {
+        my ($key, $value) = split(/=/, $pair, 2);
+        $attribute{$key} = $value if defined $value;
+    }
+    die "ID or Name attribute missing at gff file line: $gff_line!\n" unless defined $attribute{ID} && defined $attribute{Name};
+    print {$output_fh} "$chromosome\t$biotype\texon\t$start\t$end\t$score\t$strand\t$frame\tgene_id \"$attribute{ID}\"; gene_name \"$attribute{Name}\";\n";
+}
+
+# Close the files; buffered write errors only show up at close
+close ($input_fh);
+close ($output_fh) or die "Cannot close $output: $!\n";
+
+print STDERR "Converting from gff format to gtf format completed!\n\n";
+
+my $finish_date = localtime;
+print STDERR "Finish = $finish_date\n\n";
+
+__END__
diff --git a/Perl_scripts/miRNA_info_grepping.pl b/Perl_scripts/miRNA_info_grepping.pl
@@ -0,0 +1,125 @@
+#!/usr/bin/perl -w
+
+# Script used to collect the information of each mature miRNA (gene_id, position,
+# strand, chromosome, sequence and precursor details) from a gff annotation file
+# and a fasta sequence file, and to write it as a tab-separated table
+
+use strict;
+use warnings;
+use Getopt::Long;
+use IO::File;
+use List::Util 'first';
+
+# Define the different input/output files
+my $fasta;    # Input fasta file containing miRNA sequence
+my $gff;      # Input gff file
+my $output;   # Output file which will contain all the different genes info
+
+# Read the parameters and stop on malformed options instead of carrying on
+GetOptions (
+    'fasta=s' => \$fasta,
+    'gff=s' => \$gff,
+    'output=s' => \$output,
+) or die "Invalid command line parameters!\n";
+
+my $start_date = localtime;
+print STDERR "\n################################\nSTART = $start_date\n################################\n\n";
+
+# Check all parameters before opening any file
+unless (defined $fasta && length $fasta) {
+    die "Please specify the fasta file containing the miRNA sequence via -fasta parameter!\n";
+}
+unless (defined $gff && length $gff) {
+    die "Please specify the gff file containing the gene annotation via -gff parameter!\n";
+}
+unless (defined $output && length $output) {
+    die "Please specify the output file via -output parameter!\n";
+}
+
+# Open the input files; the three-argument open keeps special characters in a
+# file name from being read as an open mode
+open (my $fasta_fh, '<', $fasta) or die "Cannot open $fasta: $!\n";
+open (my $gff_fh, '<', $gff) or die "Cannot open $gff: $!\n";
+
+# Open the output file, refusing to overwrite an existing file
+if (-e $output) {
+    die "This file: $output already exists, cannot overwrite it!\n";
+}
+open (my $output_fh, '>', $output) or die "Cannot open $output: $!\n";
+
+# Define variables required for reading the fasta input file
+my %mirna_seq;       # Hash containing each sequence of mature miRNA
+my $total = 0;       # Variable to count line in input file
+my $ig_count = 0;    # Variable to count line ignored from input file
+my $keep_count = 0;  # Variable to count line kept from input file
+
+# Read in the fasta sequence file; the loop ends at end of file, so an empty
+# file is read without any warning
+while (my $line = <$fasta_fh>) {
+    chomp ($line);
+    unless ($line =~ /^>bta-/) {   # Only records whose header starts with >bta- are collected
+        $total ++;
+        $ig_count ++;
+        next;
+    }
+    my ($name) = (split(/\s/, $line));
+    $name =~ s/^>//;
+    # The sequence is expected on the line right after its header
+    my $sequence = <$fasta_fh>;
+    die "The miRNA name $name has no sequence line at the end of $fasta file!\n" unless defined $sequence;
+    chomp ($sequence);
+    die "The miRNA name $name already has a sequence value!\n" if exists $mirna_seq{$name};
+    $mirna_seq{$name} = $sequence;
+    $total += 2;
+    $keep_count += 2;
+}
+close ($fasta_fh);    # Close sequence fasta file
+
+print STDERR "The total line count from $fasta file is: $total; ignored line is: $ig_count and kept line is: $keep_count!\n\n";
+
+# Define variables required for reading gff input file
+my %premirna;   # Hash containing each precursor miRNA information
+$total = 0;     # Variable to count line in input file
+my $out = 0;    # Variable to count line in output file
+my %unknown = (name => "Undefined", start => "Undefined", end => "Undefined");   # Stand-in for a precursor not seen in the gff file
+
+# Read and split the gff file for information collection
+print {$output_fh} "gene_id\tgene_name\tchromosome\tstart_position\tend_position\tstrand\tsequence\tprecursor_id\tprecursor_name\tprecursor_start\tprecursor_end\n";
+while (my $line = <$gff_fh>) {
+    chomp ($line);
+    $total ++;
+    next if $line =~ /^#/ || $line !~ /\S/;   # Skip comment and blank lines
+    my ($chromosome, $source, $feature, $start, $end, $score, $strand, $frame, $attributes) = (split(/\t/, $line));
+    die "Fewer than 9 tab-separated columns at gff file line: $total!\n" unless defined $attributes;
+    # Look the attributes up by key so that their order in the file does not matter
+    my %attribute;
+    foreach my $pair (split(/;/, $attributes)) {
+        my ($key, $value) = split(/=/, $pair, 2);
+        $attribute{$key} = $value if defined $value;
+    }
+    my ($id, $name) = ($attribute{ID}, $attribute{Name});
+    die "ID or Name attribute missing at gff file line: $total!\n" unless defined $id && defined $name;
+    if ($feature eq "miRNA_primary_transcript") {
+        $premirna{$id} = { name => $name, start => $start, end => $end };
+    }
+    elsif ($feature eq "miRNA") {
+        # Missing sequence or precursor details are written as "Undefined" rather than left empty
+        my $derives = defined $attribute{Derives_from} ? $attribute{Derives_from} : "Undefined";
+        my $sequence = exists $mirna_seq{$name} ? $mirna_seq{$name} : "Undefined";
+        my $precursor = exists $premirna{$derives} ? $premirna{$derives} : \%unknown;
+        print {$output_fh} "$id\t$name\t$chromosome\t$start\t$end\t$strand\t$sequence\t$derives\t$precursor->{name}\t$precursor->{start}\t$precursor->{end}\n";
+        $out ++;
+    }
+    else {
+        die "Biotype value not recognised at gff file line: $total!\n";
+    }
+}
+close ($gff_fh);
+close ($output_fh) or die "Cannot close $output: $!\n";
+
+print STDERR "There were $total lines from $gff input file and $out lines in the $output output file!\n";
+
+my $finish_date = localtime;
+print STDERR "\n##############################\nEND = $finish_date\n##############################\n\n";
+
+__END__