diff --git a/Makefile b/Makefile index fce03eb8..de59a6da 100644 --- a/Makefile +++ b/Makefile @@ -77,6 +77,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ $(SRC_DIR)/randomBed \ $(SRC_DIR)/regressTest \ $(SRC_DIR)/reldist \ + $(SRC_DIR)/revcompBed \ $(SRC_DIR)/sampleFile \ $(SRC_DIR)/shiftBed \ $(SRC_DIR)/shuffleBed \ diff --git a/src/bedtools.cpp b/src/bedtools.cpp index 0bccebb7..78b276bb 100644 --- a/src/bedtools.cpp +++ b/src/bedtools.cpp @@ -74,6 +74,7 @@ void summary_help(); int reldist_main(int argc, char* argv[]); // void sample_help(); int shift_main(int argc, char* argv[]); // +int revcomp_main(int argc, char* argv[]); int shuffle_main(int argc, char* argv[]); // int slop_main(int argc, char* argv[]); // int split_main(int argc, char* argv[]); // @@ -132,7 +133,8 @@ int main(int argc, char *argv[]) else if (subCmd == "window") return window_main(argc-1, argv+1); else if (subCmd == "genomecov") return genomecoverage_main(argc-1, argv+1); else if (subCmd == "cluster") return cluster_main(argc-1, argv+1); - else if (subCmd == "shift") return shift_main(argc-1, argv+1); + else if (subCmd == "shift") return shift_main(argc-1, argv+1); + else if (subCmd == "revcomp") return revcomp_main(argc-1, argv+1); else if (subCmd == "slop") return slop_main(argc-1, argv+1); else if (subCmd == "split") return split_main(argc-1, argv+1); else if (subCmd == "flank") return flank_main(argc-1, argv+1); @@ -236,6 +238,7 @@ int bedtools_help(void) cout << " cluster " << "Cluster (but don't merge) overlapping/nearby intervals.\n"; cout << " complement " << "Extract intervals _not_ represented by an interval file.\n"; cout << " shift " << "Adjust the position of intervals.\n"; + cout << " revcomp " << "Compute the intervals on the reverse complement.\n"; cout << " subtract " << "Remove intervals based on overlaps b/w two files.\n"; cout << " slop " << "Adjust the size of intervals.\n"; cout << " flank " << "Create new intervals from the flanks of existing intervals.\n"; diff --git a/src/revcompBed/Makefile.frag b/src/revcompBed/Makefile.frag new file mode 100644 index 00000000..7845a74d --- /dev/null +++ b/src/revcompBed/Makefile.frag @@ -0,0 +1,7 @@ +BUILT_OBJECTS += obj/revcompBedMain.o obj/revcompBed.o + +obj/revcompBedMain.o: src/revcompBed/revcompBedMain.cpp obj/revcompBedMain.d + $(CXX_COMPILE) + +obj/revcompBed.o: src/revcompBed/revcompBed.cpp obj/revcompBed.d + $(CXX_COMPILE) diff --git a/src/revcompBed/revcompBed.cpp b/src/revcompBed/revcompBed.cpp new file mode 100644 index 00000000..6803fad0 --- /dev/null +++ b/src/revcompBed/revcompBed.cpp @@ -0,0 +1,48 @@ +#include "revcompBed.h" + +#include "lineFileUtilities.h" + +BedRevcomp::BedRevcomp(const std::string &bedFile, const std::string &genomeFile, bool printHeader): + _bedFile(bedFile), + _genomeFile(genomeFile), + _printHeader(printHeader), + _bed(_bedFile), + _genome(genomeFile) +{ + Revcomp(); +} + +void BedRevcomp::Revcomp() { + + BED bedEntry; // used to store the current BED line from the BED file. + + _bed.Open(); + // report header first if asked. + if (_printHeader == true) { + _bed.PrintHeader(); + } + while (_bed.GetNextBed(bedEntry)) { + if (_bed._status == BED_VALID) { + Revcomp(bedEntry); + _bed.reportBedNewLine(bedEntry); + } + } + _bed.Close(); +} + +namespace { +// like std::clamp from C++17 +CHRPOS clamp(const CHRPOS& a, const CHRPOS& b, const CHRPOS& c) { + return std::min(std::max(a, b), c); +} +} + +void BedRevcomp::Revcomp(BED &bed) { + CHRPOS chromSize = _genome.getChromSize(bed.chrom); + + auto rcStart = clamp(0, chromSize - bed.end, chromSize); + auto rcEnd = clamp(0, chromSize - bed.start, chromSize); + + bed.end = rcEnd; + bed.start = rcStart; +} diff --git a/src/revcompBed/revcompBed.h b/src/revcompBed/revcompBed.h new file mode 100644 index 00000000..14ce7602 --- /dev/null +++ b/src/revcompBed/revcompBed.h @@ -0,0 +1,26 @@ +#pragma once + +#include "bedFile.h" +#include "GenomeFile.h" + +#include + +class BedRevcomp { + +public: + + BedRevcomp(const std::string &bedFile, const std::string &genomeFile, bool printHeader); + +private: + + std::string _bedFile; + std::string _genomeFile; + + bool _printHeader; + + BedFile _bed; + GenomeFile _genome; + + void Revcomp(); + void Revcomp(BED &bed); +}; diff --git a/src/revcompBed/revcompBedMain.cpp b/src/revcompBed/revcompBedMain.cpp new file mode 100644 index 00000000..3186ca3a --- /dev/null +++ b/src/revcompBed/revcompBedMain.cpp @@ -0,0 +1,117 @@ +#include "revcompBed.h" +#include "version.h" + +using namespace std; + +// define our program name +#define PROGRAM_NAME "bedtools revcomp" + +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) + +// function declarations +void revcomp_help(int); + +int revcomp_main(int argc, char* argv[]) { + + // our configuration variables + bool showHelp = false; + + // input files + string bedFile = "stdin"; + string genomeFile; + + bool haveBed = true; + bool haveGenome = false; + bool printHeader = false; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) revcomp_help(0); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + bedFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-g", 2, parameterLength)) { + if ((i+1) < argc) { + haveGenome = true; + genomeFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-header", 7, parameterLength)) { + printHeader = true; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBed || !haveGenome) { + cerr << endl << "*****" << endl << "*****ERROR: Need both a BED (-i) and a genome (-g) file. " << endl << "*****" << endl; + showHelp = true; + } + + if (showHelp) revcomp_help(1); + + BedRevcomp(bedFile, genomeFile, printHeader); + return 0; +} + +void revcomp_help(int exit_status) { + + cerr << "\nTool: bedtools revcomp (aka revcompBed)" << endl; + cerr << "Version: " << VERSION << "\n"; + cerr << "Summary: Compute the intervals on the reverse complement." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i -g " << endl << endl; + + cerr << "Options: " << endl; + cerr << "\t-header\t" << "Print the header from the input file prior to results." << endl << endl; + + cerr << "Notes: " << endl; + cerr << "\t(1) Starts will be set to 0 if options would force it below 0." << endl; + cerr << "\t(2) Ends will be set to the chromosome length if above the max chrom length." << endl; + cerr << "\t(3) The genome file should be tab delimited and structured as follows:" << endl; + cerr << "\n\t" << endl << endl; + cerr << "\tFor example, Human (hg19):" << endl; + cerr << "\tchr1\t249250621" << endl; + cerr << "\tchr2\t243199373" << endl; + cerr << "\t..." << endl; + cerr << "\tchr18_gl000207_random\t4262" << endl << endl; + + cerr << "Tip 1. Use samtools faidx to create a genome file from a FASTA: " << endl; + cerr << "\tOne can the samtools faidx command to index a FASTA file." << endl; + cerr << "\tThe resulting .fai index is suitable as a genome file, " << endl; + cerr << "\tas bedtools will only look at the first two, relevant columns" << endl; + cerr << "\tof the .fai file." << endl << endl; + cerr << "\tFor example:" << endl; + cerr << "\tsamtools faidx GRCh38.fa" << endl; + cerr << "\tbedtools revcomp -i my.bed -g GRCh38.fa.fai" << endl << endl; + + cerr << "Tip 2. Use UCSC Table Browser to create a genome file: " << endl; + cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl; + cerr << "\tchromosome sizes. For example, H. sapiens:" << endl << endl; + cerr << "\tmysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \\" << endl; + cerr << "\t\"select chrom, size from hg19.chromInfo\" > hg19.genome" << endl << endl; + + // end the program here + exit(exit_status); +} diff --git a/test/revcomp/a.bed b/test/revcomp/a.bed new file mode 100644 index 00000000..e0ddf220 --- /dev/null +++ b/test/revcomp/a.bed @@ -0,0 +1,2 @@ +chr1 100 200 a1 1 + +chr1 100 200 a2 2 - diff --git a/test/revcomp/b.bed b/test/revcomp/b.bed new file mode 100644 index 00000000..a147e8ac --- /dev/null +++ b/test/revcomp/b.bed @@ -0,0 +1,2 @@ +chr1 66999638 67216822 NM_032291 0 + +chr1 92145899 92351836 NR_036634 0 - diff --git a/test/revcomp/huge.genome b/test/revcomp/huge.genome new file mode 100644 index 00000000..d8ebc53a --- /dev/null +++ b/test/revcomp/huge.genome @@ -0,0 +1 @@ +chr1 249250621 diff --git a/test/revcomp/test-revcomp.sh b/test/revcomp/test-revcomp.sh new file mode 100755 index 00000000..147ecff5 --- /dev/null +++ b/test/revcomp/test-revcomp.sh @@ -0,0 +1,54 @@ +set -e; +BT="${BT-../../bin/bedtools}" + +FAILURES=0; + +check() +{ + if diff $1 $2; then + echo ok + + else + FAILURES=$(expr $FAILURES + 1); + echo fail + + fi +} + +# cat a.bed +# chr1 100 200 a1 1 + +# chr1 100 200 a2 2 - + +########################################################### +# test simple +########################################################### +echo -e " revcomp.t1...\c" +echo \ +"chr1 800 900 a1 1 + +chr1 800 900 a2 2 -" > exp +"$BT" revcomp -i a.bed -g tiny.genome > obs +check obs exp +rm obs exp + +echo -e " revcomp.t2...\c" +echo \ +"chr1 249250421 249250521 a1 1 + +chr1 249250421 249250521 a2 2 -" > exp +"$BT" revcomp -i a.bed -g huge.genome > obs +check obs exp +rm obs exp + + +########################################################### +# test clamp +########################################################### +echo -e " revcomp.t3...\c" +echo \ +"chr1 0 0 NM_032291 0 + +chr1 0 0 NR_036634 0 -" > exp +"$BT" revcomp -i b.bed -g tiny.genome > obs +check obs exp +rm obs exp + + +[[ $FAILURES -eq 0 ]] || exit 1; diff --git a/test/revcomp/tiny.genome b/test/revcomp/tiny.genome new file mode 100644 index 00000000..0dd8b222 --- /dev/null +++ b/test/revcomp/tiny.genome @@ -0,0 +1 @@ +chr1 1000