Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding a revcomp tool #1095

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \
$(SRC_DIR)/randomBed \
$(SRC_DIR)/regressTest \
$(SRC_DIR)/reldist \
$(SRC_DIR)/revcompBed \
$(SRC_DIR)/sampleFile \
$(SRC_DIR)/shiftBed \
$(SRC_DIR)/shuffleBed \
Expand Down
5 changes: 4 additions & 1 deletion src/bedtools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ void summary_help();
int reldist_main(int argc, char* argv[]); //
void sample_help();
int shift_main(int argc, char* argv[]); //
int revcomp_main(int argc, char* argv[]);
int shuffle_main(int argc, char* argv[]); //
int slop_main(int argc, char* argv[]); //
int split_main(int argc, char* argv[]); //
Expand Down Expand Up @@ -132,7 +133,8 @@ int main(int argc, char *argv[])
else if (subCmd == "window") return window_main(argc-1, argv+1);
else if (subCmd == "genomecov") return genomecoverage_main(argc-1, argv+1);
else if (subCmd == "cluster") return cluster_main(argc-1, argv+1);
else if (subCmd == "shift") return shift_main(argc-1, argv+1);
else if (subCmd == "shift") return shift_main(argc-1, argv+1);
else if (subCmd == "revcomp") return revcomp_main(argc-1, argv+1);
else if (subCmd == "slop") return slop_main(argc-1, argv+1);
else if (subCmd == "split") return split_main(argc-1, argv+1);
else if (subCmd == "flank") return flank_main(argc-1, argv+1);
Expand Down Expand Up @@ -236,6 +238,7 @@ int bedtools_help(void)
cout << " cluster " << "Cluster (but don't merge) overlapping/nearby intervals.\n";
cout << " complement " << "Extract intervals _not_ represented by an interval file.\n";
cout << " shift " << "Adjust the position of intervals.\n";
cout << " revcomp " << "Compute the intervals on the reverse complement.\n";
cout << " subtract " << "Remove intervals based on overlaps b/w two files.\n";
cout << " slop " << "Adjust the size of intervals.\n";
cout << " flank " << "Create new intervals from the flanks of existing intervals.\n";
Expand Down
7 changes: 7 additions & 0 deletions src/revcompBed/Makefile.frag
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
BUILT_OBJECTS += obj/revcompBedMain.o obj/revcompBed.o

obj/revcompBedMain.o: src/revcompBed/revcompBedMain.cpp obj/revcompBedMain.d
$(CXX_COMPILE)

obj/revcompBed.o: src/revcompBed/revcompBed.cpp obj/revcompBed.d
$(CXX_COMPILE)
48 changes: 48 additions & 0 deletions src/revcompBed/revcompBed.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#include "revcompBed.h"

#include "lineFileUtilities.h"

BedRevcomp::BedRevcomp(const std::string &bedFile, const std::string &genomeFile, bool printHeader):
_bedFile(bedFile),
_genomeFile(genomeFile),
_printHeader(printHeader),
_bed(_bedFile),
_genome(genomeFile)
{
Revcomp();
}

void BedRevcomp::Revcomp() {

BED bedEntry; // used to store the current BED line from the BED file.

_bed.Open();
// report header first if asked.
if (_printHeader == true) {
_bed.PrintHeader();
}
while (_bed.GetNextBed(bedEntry)) {
if (_bed._status == BED_VALID) {
Revcomp(bedEntry);
_bed.reportBedNewLine(bedEntry);
}
}
_bed.Close();
}

namespace {
// like std::clamp from C++17
CHRPOS clamp(const CHRPOS& a, const CHRPOS& b, const CHRPOS& c) {
return std::min(std::max(a, b), c);
}
}

void BedRevcomp::Revcomp(BED &bed) {
CHRPOS chromSize = _genome.getChromSize(bed.chrom);

auto rcStart = clamp(0, chromSize - bed.end, chromSize);
auto rcEnd = clamp(0, chromSize - bed.start, chromSize);

bed.end = rcEnd;
bed.start = rcStart;
}
26 changes: 26 additions & 0 deletions src/revcompBed/revcompBed.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#pragma once

#include "bedFile.h"
#include "GenomeFile.h"

#include <string>

class BedRevcomp {

public:

BedRevcomp(const std::string &bedFile, const std::string &genomeFile, bool printHeader);

private:

std::string _bedFile;
std::string _genomeFile;

bool _printHeader;

BedFile _bed;
GenomeFile _genome;

void Revcomp();
void Revcomp(BED &bed);
};
117 changes: 117 additions & 0 deletions src/revcompBed/revcompBedMain.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#include "revcompBed.h"
#include "version.h"

using namespace std;

// define our program name
#define PROGRAM_NAME "bedtools revcomp"

// define our parameter checking macro
#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen)

// function declarations
void revcomp_help(int);

int revcomp_main(int argc, char* argv[]) {

// our configuration variables
bool showHelp = false;

// input files
string bedFile = "stdin";
string genomeFile;

bool haveBed = true;
bool haveGenome = false;
bool printHeader = false;

for(int i = 1; i < argc; i++) {
int parameterLength = (int)strlen(argv[i]);

if((PARAMETER_CHECK("-h", 2, parameterLength)) ||
(PARAMETER_CHECK("--help", 5, parameterLength))) {
showHelp = true;
}
}

if(showHelp) revcomp_help(0);

// do some parsing (all of these parameters require 2 strings)
for(int i = 1; i < argc; i++) {

int parameterLength = (int)strlen(argv[i]);

if(PARAMETER_CHECK("-i", 2, parameterLength)) {
if ((i+1) < argc) {
bedFile = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-g", 2, parameterLength)) {
if ((i+1) < argc) {
haveGenome = true;
genomeFile = argv[i + 1];
i++;
}
}
else if(PARAMETER_CHECK("-header", 7, parameterLength)) {
printHeader = true;
}
else {
cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl;
showHelp = true;
}
}

// make sure we have both input files
if (!haveBed || !haveGenome) {
cerr << endl << "*****" << endl << "*****ERROR: Need both a BED (-i) and a genome (-g) file. " << endl << "*****" << endl;
showHelp = true;
}

if (showHelp) revcomp_help(1);

BedRevcomp(bedFile, genomeFile, printHeader);
return 0;
}

void revcomp_help(int exit_status) {

cerr << "\nTool: bedtools revcomp (aka revcompBed)" << endl;
cerr << "Version: " << VERSION << "\n";
cerr << "Summary: Compute the intervals on the reverse complement." << endl << endl;

cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i <bed/gff/vcf> -g <genome>" << endl << endl;

cerr << "Options: " << endl;
cerr << "\t-header\t" << "Print the header from the input file prior to results." << endl << endl;

cerr << "Notes: " << endl;
cerr << "\t(1) Starts will be set to 0 if options would force it below 0." << endl;
cerr << "\t(2) Ends will be set to the chromosome length if above the max chrom length." << endl;
cerr << "\t(3) The genome file should be tab delimited and structured as follows:" << endl;
cerr << "\n\t<chromName><TAB><chromSize>" << endl << endl;
cerr << "\tFor example, Human (hg19):" << endl;
cerr << "\tchr1\t249250621" << endl;
cerr << "\tchr2\t243199373" << endl;
cerr << "\t..." << endl;
cerr << "\tchr18_gl000207_random\t4262" << endl << endl;

cerr << "Tip 1. Use samtools faidx to create a genome file from a FASTA: " << endl;
cerr << "\tOne can the samtools faidx command to index a FASTA file." << endl;
cerr << "\tThe resulting .fai index is suitable as a genome file, " << endl;
cerr << "\tas bedtools will only look at the first two, relevant columns" << endl;
cerr << "\tof the .fai file." << endl << endl;
cerr << "\tFor example:" << endl;
cerr << "\tsamtools faidx GRCh38.fa" << endl;
cerr << "\tbedtools revcomp -i my.bed -g GRCh38.fa.fai" << endl << endl;

cerr << "Tip 2. Use UCSC Table Browser to create a genome file: " << endl;
cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl;
cerr << "\tchromosome sizes. For example, H. sapiens:" << endl << endl;
cerr << "\tmysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \\" << endl;
cerr << "\t\"select chrom, size from hg19.chromInfo\" > hg19.genome" << endl << endl;

// end the program here
exit(exit_status);
}
2 changes: 2 additions & 0 deletions test/revcomp/a.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
chr1 100 200 a1 1 +
chr1 100 200 a2 2 -
2 changes: 2 additions & 0 deletions test/revcomp/b.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
chr1 66999638 67216822 NM_032291 0 +
chr1 92145899 92351836 NR_036634 0 -
1 change: 1 addition & 0 deletions test/revcomp/huge.genome
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
chr1 249250621
54 changes: 54 additions & 0 deletions test/revcomp/test-revcomp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
set -e;
BT="${BT-../../bin/bedtools}"

FAILURES=0;

check()
{
if diff $1 $2; then
echo ok

else
FAILURES=$(expr $FAILURES + 1);
echo fail

fi
}

# cat a.bed
# chr1 100 200 a1 1 +
# chr1 100 200 a2 2 -

###########################################################
# test simple
###########################################################
echo -e " revcomp.t1...\c"
echo \
"chr1 800 900 a1 1 +
chr1 800 900 a2 2 -" > exp
"$BT" revcomp -i a.bed -g tiny.genome > obs
check obs exp
rm obs exp

echo -e " revcomp.t2...\c"
echo \
"chr1 249250421 249250521 a1 1 +
chr1 249250421 249250521 a2 2 -" > exp
"$BT" revcomp -i a.bed -g huge.genome > obs
check obs exp
rm obs exp


###########################################################
# test clamp
###########################################################
echo -e " revcomp.t3...\c"
echo \
"chr1 0 0 NM_032291 0 +
chr1 0 0 NR_036634 0 -" > exp
"$BT" revcomp -i b.bed -g tiny.genome > obs
check obs exp
rm obs exp


[[ $FAILURES -eq 0 ]] || exit 1;
1 change: 1 addition & 0 deletions test/revcomp/tiny.genome
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
chr1 1000