Skip to content

Commit

Permalink
Add the custom perl scripts
Browse files Browse the repository at this point in the history
Add the current version of custom perl scripts for selection of specific
value lines in fasta files, for exclusion of specific value lines in
fasta files, for converting gff file to gtf file and for getting
microRNA annotation information into a txt file
  • Loading branch information
nnalpas committed Jan 21, 2015
1 parent 6062e19 commit f1115d5
Show file tree
Hide file tree
Showing 4 changed files with 383 additions and 0 deletions.
88 changes: 88 additions & 0 deletions Perl_scripts/Fasta_ignore_value.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/usr/bin/perl -w

# Script used on a sequence fasta file to write a filtered copy in which every
# record whose header line matches the -ignore value is left out.
#
# Parameters:
#   -fasta   input fasta file (required)
#   -ignore  value, interpreted as a Perl regular expression, searched for in
#            each header line; matching records (header and the sequence lines
#            that follow it) are dropped (required)
#   -output  fasta file to create; the script refuses to overwrite an existing
#            file (required)
#
# Start/finish times and the total/ignored/kept line counts go to STDERR.

# Define all modules to be used in script
use strict;
use warnings;
use Getopt::Long;
use IO::File;

# Define the different input/output parameters
my $fasta;  # Input fasta file to filter
my $ignore; # Header value (and subsequent sequence) to ignore
my $output; # Output file containing the fasta sequences that were kept

# Define the parameter in order to submit input files to this script
GetOptions(
    'fasta=s'  => \$fasta,
    'ignore=s' => \$ignore,
    'output=s' => \$output,
) or die "Usage: $0 -fasta <input fasta> -ignore <header value> -output <output fasta>\n";

my $start_date = localtime;
print STDERR "START = $start_date\n\n";

# Validate every parameter before any file is opened or created
unless (defined $fasta && length $fasta) {
    die "Please specify the fasta file via -fasta parameter!\n";
}
unless (defined $ignore && length $ignore) {
    die "Please specify the value to ignore in the fasta header via -ignore parameter!\n";
}
unless (defined $output && length $output) {
    die "Please specify the output fasta file via -output parameter!\n";
}

# Compile the pattern once so that an invalid value is reported up front
# instead of aborting in the middle of the file
my $ignore_re = eval { qr/$ignore/ };
unless (defined $ignore_re) {
    die "The value given via -ignore parameter is not a valid pattern: $@";
}

# Open the input fasta file
open(my $fasta_fh, '<', $fasta) or die "Cannot open $fasta: $!\n";

# Open the output fasta file, never overwriting an existing one
if (-e $output) {
    die "This file: $output already exists, cannot overwrite it!\n";
}
open(my $output_fh, '>', $output) or die "Cannot open $output: $!\n";

# Define variables required for reading the input file
my $action     = "keep"; # Lines found before the first header are kept
my $total      = 0;      # Variable to count line in input file
my $ig_count   = 0;      # Variable to count line ignored from input file
my $keep_count = 0;      # Variable to count line kept from input file

# Read in the fasta sequence file line by line
while (defined(my $line = <$fasta_fh>)) {
    chomp $line;
    $total++;

    # Only a header line decides the fate of a record; sequence lines simply
    # follow the decision taken for their header, so a sequence that happens
    # to contain the value can no longer truncate a kept record
    if ($line =~ /^>/) {
        $action = ($line =~ $ignore_re) ? "remove" : "keep";
    }

    if ($action eq "remove") {
        $ig_count++;
    }
    else {
        $keep_count++;
        print {$output_fh} "$line\n" or die "Cannot write to $output: $!\n";
    }
}
close($fasta_fh); # Close sequence fasta file

# Buffered write errors only surface at close, so check it
close($output_fh) or die "Cannot close $output: $!\n";

print STDERR "The total line count from input file is: $total; ignored line is: $ig_count and kept line is: $keep_count!\n\n";

my $finish_date = localtime;
print STDERR "Finish = $finish_date\n\n";

__END__
88 changes: 88 additions & 0 deletions Perl_scripts/Fasta_keep_value.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/usr/bin/perl -w

# Script used on a sequence fasta file to write a filtered copy that contains
# only the records whose header line matches the -keep value.
#
# Parameters:
#   -fasta   input fasta file (required)
#   -keep    value, interpreted as a Perl regular expression, searched for in
#            each header line; matching records (header and the sequence lines
#            that follow it) are written to the output (required)
#   -output  fasta file to create; the script refuses to overwrite an existing
#            file (required)
#
# Start/finish times and the total/ignored/kept line counts go to STDERR.

# Define all modules to be used in script
use strict;
use warnings;
use Getopt::Long;
use IO::File;

# Define the different input/output parameters
my $fasta;  # Input fasta file to filter
my $keep;   # Header value (and subsequent sequence) to keep
my $output; # Output file containing the fasta sequences that were kept

# Define the parameter in order to submit input files to this script
GetOptions(
    'fasta=s'  => \$fasta,
    'keep=s'   => \$keep,
    'output=s' => \$output,
) or die "Usage: $0 -fasta <input fasta> -keep <header value> -output <output fasta>\n";

my $start_date = localtime;
print STDERR "START = $start_date\n\n";

# Validate every parameter before any file is opened or created
unless (defined $fasta && length $fasta) {
    die "Please specify the fasta file via -fasta parameter!\n";
}
unless (defined $keep && length $keep) {
    die "Please specify the value to keep in the fasta header via -keep parameter!\n";
}
unless (defined $output && length $output) {
    die "Please specify the output fasta file via -output parameter!\n";
}

# Compile the pattern once so that an invalid value is reported up front
# instead of aborting in the middle of the file
my $keep_re = eval { qr/$keep/ };
unless (defined $keep_re) {
    die "The value given via -keep parameter is not a valid pattern: $@";
}

# Open the input fasta file
open(my $fasta_fh, '<', $fasta) or die "Cannot open $fasta: $!\n";

# Open the output fasta file, never overwriting an existing one
if (-e $output) {
    die "This file: $output already exists, cannot overwrite it!\n";
}
open(my $output_fh, '>', $output) or die "Cannot open $output: $!\n";

# Define variables required for reading the input file
my $action     = "remove"; # Lines found before the first header are ignored
my $total      = 0;        # Variable to count line in input file
my $ig_count   = 0;        # Variable to count line ignored from input file
my $keep_count = 0;        # Variable to count line kept from input file

# Read in the fasta sequence file line by line
while (defined(my $line = <$fasta_fh>)) {
    chomp $line;
    $total++;

    # Only a header line decides the fate of a record; sequence lines simply
    # follow the decision taken for their header, so a sequence that happens
    # to contain the value can no longer pull in part of an unwanted record
    if ($line =~ /^>/) {
        $action = ($line =~ $keep_re) ? "keep" : "remove";
    }

    if ($action eq "keep") {
        $keep_count++;
        print {$output_fh} "$line\n" or die "Cannot write to $output: $!\n";
    }
    else {
        $ig_count++;
    }
}
close($fasta_fh); # Close sequence fasta file

# Buffered write errors only surface at close, so check it
close($output_fh) or die "Cannot close $output: $!\n";

print STDERR "The total line count from input file is: $total; ignored line is: $ig_count and kept line is: $keep_count!\n\n";

my $finish_date = localtime;
print STDERR "Finish = $finish_date\n\n";

__END__
82 changes: 82 additions & 0 deletions Perl_scripts/gff2gtf.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/usr/bin/perl -w

# Script used to convert gff format to gtf format.
#
# Parameters:
#   -i  input gff file containing miRNA annotation (required)
#   -o  output gtf file (optional); when omitted the name is derived from the
#       input name by replacing a trailing .gff/.gff3 with .gtf, or by
#       appending .gtf when the input has another suffix
#
# Only "miRNA_primary_transcript" and "miRNA" features are accepted; any other
# feature aborts the conversion. Each accepted feature is written as an "exon"
# line whose source column holds the biotype and whose attributes are
# gene_id (from ID) and gene_name (from Name).

# Define all modules to be used in script
use strict;
use warnings;
use Getopt::Long;
use IO::File;

# Define the input file
my $input;  # Input file containing gff format annotation
my $output; # Output file containing gtf format annotation

# Define the parameter in order to submit input files to this script
GetOptions(
    'i=s' => \$input,
    'o=s' => \$output,
) or die "Usage: $0 -i <input gff> [-o <output gtf>]\n";

my $start_date = localtime;
print STDERR "START = $start_date\n\n";

# Open the input gff file
unless (defined $input && length $input) {
    die "Please specify the gff file to convert via -i parameter!\n";
}
open(my $input_fh, '<', $input) or die "Cannot open $input: $!\n";

# Define output file
unless (defined $output && length $output) {
    $output = $input;

    # Without a recognised suffix the derived name would equal the input name,
    # so append the extension instead of leaving the name untouched
    unless ($output =~ s/\.gff3?\z/.gtf/) {
        $output .= ".gtf";
    }
    print STDERR "File name for gtf output file not provided (-o parameter), name generated from input gff file: $output\n";
}

# Open the output file which will be in gtf format, never overwriting
if (-e $output) {
    die "This file: $output already exists, cannot overwrite it!\n";
}
open(my $output_fh, '>', $output) or die "Cannot open $output: $!\n";

# Define variables required for reading input file
my $gff_line = 0; # Variable to count line in input file
my %biotype_for = (
    miRNA_primary_transcript => "pre-miRNA",
    miRNA                    => "miRNA",
);

# Read and split the gff file for converting to gtf format
while (defined(my $line = <$input_fh>)) {
    $gff_line++;
    chomp $line;
    $line =~ s/\r\z//; # Tolerate Windows line endings

    # Comment/directive lines and blank lines carry no feature
    next if $line =~ /^#/;
    next if $line =~ /^\s*\z/;

    my @fields = split(/\t/, $line);
    if (@fields < 9) {
        die "Expected 9 tab-separated columns at gff file line: $gff_line!\n";
    }
    my ($chromosome, $source, $feature, $start, $end, $score, $strand, $frame, $attributes) = @fields;

    my $biotype = $biotype_for{$feature};
    unless (defined $biotype) {
        die "Biotype value not recognised at gff file line: $gff_line!\n";
    }

    # Read the attributes by key rather than by position so that the result
    # does not depend on the order in which they are listed
    my %attribute_of;
    for my $pair (split(/;/, $attributes)) {
        my ($key, $value) = split(/=/, $pair, 2);
        next unless defined $key && defined $value;
        $key =~ s/^\s+//;
        $attribute_of{$key} = $value;
    }
    for my $required (qw(ID Name)) {
        unless (defined $attribute_of{$required}) {
            die "Attribute $required not found at gff file line: $gff_line!\n";
        }
    }

    print {$output_fh} "$chromosome\t$biotype\texon\t$start\t$end\t$score\t$strand\t$frame\tgene_id \"$attribute_of{ID}\"; gene_name \"$attribute_of{Name}\";\n"
        or die "Cannot write to $output: $!\n";
}

# Close the files; buffered write errors only surface at close
close($input_fh);
close($output_fh) or die "Cannot close $output: $!\n";

print STDERR "Converting from gff format to gtf format completed!\n\n";

my $finish_date = localtime;
print STDERR "Finish = $finish_date\n\n";

__END__
125 changes: 125 additions & 0 deletions Perl_scripts/miRNA_info_grepping.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#!/usr/bin/perl -w

# Script used to collect each miRNA information from a gff annotation file
# (start_position, end_position, strand, chromosome_name and gene_id) together
# with its sequence from a fasta file, and to write one tab-separated line per
# mature miRNA.
#
# Parameters:
#   -fasta   fasta file containing the miRNA sequences; only records whose
#            header starts with ">bta-" are collected, and the sequence is
#            read from the single line that follows the header (required)
#   -gff     gff file containing "miRNA_primary_transcript" and "miRNA"
#            features; any other feature aborts the script (required)
#   -output  tab-separated file to create; the script refuses to overwrite an
#            existing file (required)
#
# A sequence or precursor that cannot be found is reported as "Undefined" in
# the output. Progress counts and start/end times go to STDERR.

use strict;
use warnings;
use Getopt::Long;
use IO::File;
use List::Util 'first';

# Define the different input/output files
my $fasta;  # Input fasta file containing miRNA sequence
my $gff;    # Input gff file
my $output; # Output file which will contain all the different genes info

# Define the parameter in order to submit input files to this script
GetOptions(
    'fasta=s'  => \$fasta,
    'gff=s'    => \$gff,
    'output=s' => \$output,
) or die "Usage: $0 -fasta <miRNA fasta> -gff <annotation gff> -output <output txt>\n";

my $start_date = localtime;
print STDERR "\n################################\nSTART = $start_date\n################################\n\n";

# Validate every parameter before any file is opened or created
unless (defined $fasta && length $fasta) {
    die "Please specify the fasta file containing the miRNA sequence via -fasta parameter!\n";
}
unless (defined $gff && length $gff) {
    die "Please specify the gff file containing the gene annotation via -gff parameter!\n";
}
unless (defined $output && length $output) {
    die "Please specify the output file via -output parameter!\n";
}

# Open the fasta and gff input files
open(my $fasta_fh, '<', $fasta) or die "Cannot open $fasta: $!\n";
open(my $gff_fh,   '<', $gff)   or die "Cannot open $gff: $!\n";

# Open the output file, never overwriting an existing one
if (-e $output) {
    die "This file: $output already exists, cannot overwrite it!\n";
}
open(my $output_fh, '>', $output) or die "Cannot open $output: $!\n";

# Define variables required for reading the fasta input file
my %mirna_seq;      # Hash containing each sequence of mature miRNA
my $total      = 0; # Variable to count line in input file
my $ig_count   = 0; # Variable to count line ignored from input file
my $keep_count = 0; # Variable to count line kept from input file

# Read in the fasta sequence file
while (defined(my $line = <$fasta_fh>)) {
    chomp $line;
    $line =~ s/\r\z//; # Tolerate Windows line endings

    # Anything that is not a wanted header is counted as ignored
    unless ($line =~ /^>bta-/) {
        $total++;
        $ig_count++;
        next;
    }

    # The miRNA name is the first whitespace-delimited word of the header
    my ($name) = split(/\s/, $line);
    $name =~ s/^>//;

    # The line following the header holds the associated sequence
    my $sequence = <$fasta_fh>;
    unless (defined $sequence) {
        # Header on the very last line: nothing to store for it
        print STDERR "No sequence line found after header of $name, entry ignored!\n";
        $total++;
        $ig_count++;
        last;
    }
    chomp $sequence;
    $sequence =~ s/\r\z//;

    if (exists $mirna_seq{$name}) {
        die "The miRNA name $name already has a sequence value!\n";
    }
    $mirna_seq{$name} = $sequence;
    $total      += 2;
    $keep_count += 2;
}
close($fasta_fh); # Close sequence fasta file

print STDERR "The total line count from $fasta file is: $total; ignored line is: $ig_count and kept line is: $keep_count!\n\n";

# Define variables required for reading gff input file
my %premirna; # Hash containing each precursor miRNA information
$total = 0;   # Variable to count line in input file
my $out = 0;  # Variable to count line in output file

# Read and split the gff file for information collection
print {$output_fh} "gene_id\tgene_name\tchromosome\tstart_position\tend_position\tstrand\tsequence\tprecursor_id\tprecursor_name\tprecursor_start\tprecursor_end\n"
    or die "Cannot write to $output: $!\n";
while (defined(my $line = <$gff_fh>)) {
    $total++;
    chomp $line;
    $line =~ s/\r\z//;

    # Comment/directive lines and blank lines carry no feature
    next if $line =~ /^#/;
    next if $line =~ /^\s*\z/;

    my @fields = split(/\t/, $line);
    if (@fields < 9) {
        die "Expected 9 tab-separated columns at gff file line: $total!\n";
    }
    my ($chromosome, $source, $feature, $start, $end, $score, $strand, $frame, $attributes) = @fields;

    # Read the attributes by key rather than by position so that the result
    # does not depend on the order in which they are listed
    my %attribute_of;
    for my $pair (split(/;/, $attributes)) {
        my ($key, $value) = split(/=/, $pair, 2);
        next unless defined $key && defined $value;
        $key =~ s/^\s+//;
        $attribute_of{$key} = $value;
    }

    unless ($feature eq "miRNA_primary_transcript" || $feature eq "miRNA") {
        die "Biotype value not recognised at gff file line: $total!\n";
    }
    for my $required (qw(ID Name)) {
        unless (defined $attribute_of{$required}) {
            die "Attribute $required not found at gff file line: $total!\n";
        }
    }
    my $id   = $attribute_of{ID};
    my $name = $attribute_of{Name};

    if ($feature eq "miRNA_primary_transcript") {
        # Remember the precursor so that its mature miRNAs can refer to it
        $premirna{$id} = { name => $name, start => $start, end => $end };
        next;
    }

    # Mature miRNA: look up its sequence and its precursor without creating
    # empty hash entries for the ones that are unknown
    my $derives  = defined $attribute_of{Derives_from} ? $attribute_of{Derives_from} : "Undefined";
    my $sequence = exists $mirna_seq{$name} ? $mirna_seq{$name} : "Undefined";
    my $precursor = exists $premirna{$derives}
        ? $premirna{$derives}
        : { name => "Undefined", start => "Undefined", end => "Undefined" };

    print {$output_fh} "$id\t$name\t$chromosome\t$start\t$end\t$strand\t$sequence\t$derives\t$precursor->{name}\t$precursor->{start}\t$precursor->{end}\n"
        or die "Cannot write to $output: $!\n";
    $out++;
}
close($gff_fh);

# Buffered write errors only surface at close, so check it
close($output_fh) or die "Cannot close $output: $!\n";

print STDERR "There were $total lines from $gff input file and $out lines in the $output output file!\n";

my $finish_date = localtime;
print STDERR "\n##############################\nEND = $finish_date\n##############################\n\n";

__END__

0 comments on commit f1115d5

Please sign in to comment.