From c9b8ce2056c36e954f433e14ae18120ac852c6d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabian=20Kl=C3=B6tzl?= Date: Tue, 7 Nov 2023 16:36:29 +0000 Subject: [PATCH 1/4] add revcompBed --- Makefile | 1 + src/bedtools.cpp | 5 +- src/revcompBed/Makefile.frag | 7 ++ src/revcompBed/revcompBed.cpp | 50 +++++++++++++ src/revcompBed/revcompBed.h | 26 +++++++ src/revcompBed/revcompBedMain.cpp | 120 ++++++++++++++++++++++++++++++ 6 files changed, 208 insertions(+), 1 deletion(-) create mode 100644 src/revcompBed/Makefile.frag create mode 100644 src/revcompBed/revcompBed.cpp create mode 100644 src/revcompBed/revcompBed.h create mode 100644 src/revcompBed/revcompBedMain.cpp diff --git a/Makefile b/Makefile index fce03eb85..de59a6daf 100644 --- a/Makefile +++ b/Makefile @@ -77,6 +77,7 @@ SUBDIRS = $(SRC_DIR)/annotateBed \ $(SRC_DIR)/randomBed \ $(SRC_DIR)/regressTest \ $(SRC_DIR)/reldist \ + $(SRC_DIR)/revcompBed \ $(SRC_DIR)/sampleFile \ $(SRC_DIR)/shiftBed \ $(SRC_DIR)/shuffleBed \ diff --git a/src/bedtools.cpp b/src/bedtools.cpp index 0bccebb71..78b276bb9 100644 --- a/src/bedtools.cpp +++ b/src/bedtools.cpp @@ -74,6 +74,7 @@ void summary_help(); int reldist_main(int argc, char* argv[]); // void sample_help(); int shift_main(int argc, char* argv[]); // +int revcomp_main(int argc, char* argv[]); int shuffle_main(int argc, char* argv[]); // int slop_main(int argc, char* argv[]); // int split_main(int argc, char* argv[]); // @@ -132,7 +133,8 @@ int main(int argc, char *argv[]) else if (subCmd == "window") return window_main(argc-1, argv+1); else if (subCmd == "genomecov") return genomecoverage_main(argc-1, argv+1); else if (subCmd == "cluster") return cluster_main(argc-1, argv+1); - else if (subCmd == "shift") return shift_main(argc-1, argv+1); + else if (subCmd == "shift") return shift_main(argc-1, argv+1); + else if (subCmd == "revcomp") return revcomp_main(argc-1, argv+1); else if (subCmd == "slop") return slop_main(argc-1, argv+1); else if (subCmd == "split") return split_main(argc-1, argv+1); else if (subCmd == "flank") return flank_main(argc-1, argv+1); @@ -236,6 +238,7 @@ int bedtools_help(void) cout << " cluster " << "Cluster (but don't merge) overlapping/nearby intervals.\n"; cout << " complement " << "Extract intervals _not_ represented by an interval file.\n"; cout << " shift " << "Adjust the position of intervals.\n"; + cout << " revcomp " << "Compute the intervals on the reverse complement.\n"; cout << " subtract " << "Remove intervals based on overlaps b/w two files.\n"; cout << " slop " << "Adjust the size of intervals.\n"; cout << " flank " << "Create new intervals from the flanks of existing intervals.\n"; diff --git a/src/revcompBed/Makefile.frag b/src/revcompBed/Makefile.frag new file mode 100644 index 000000000..7845a74d1 --- /dev/null +++ b/src/revcompBed/Makefile.frag @@ -0,0 +1,7 @@ +BUILT_OBJECTS += obj/revcompBedMain.o obj/revcompBed.o + +obj/revcompBedMain.o: src/revcompBed/revcompBedMain.cpp obj/revcompBedMain.d + $(CXX_COMPILE) + +obj/revcompBed.o: src/revcompBed/revcompBed.cpp obj/revcompBed.d + $(CXX_COMPILE) diff --git a/src/revcompBed/revcompBed.cpp b/src/revcompBed/revcompBed.cpp new file mode 100644 index 000000000..3c7ac0a5f --- /dev/null +++ b/src/revcompBed/revcompBed.cpp @@ -0,0 +1,50 @@ +#include "revcompBed.h" + +#include "lineFileUtilities.h" + +BedRevcomp::BedRevcomp(/*const*/ std::string &bedFile, const std::string &genomeFile, bool printHeader): + _bedFile(bedFile), + _genomeFile(genomeFile), + _printHeader(printHeader) +{ + + _bed = new BedFile(bedFile); + _genome = new GenomeFile(genomeFile); + + Revcomp(); +} + +void BedRevcomp::Revcomp() { + + BED bedEntry; // used to store the current BED line from the BED file. + + _bed->Open(); + // report header first if asked. + if (_printHeader == true) { + _bed->PrintHeader(); + } + while (_bed->GetNextBed(bedEntry)) { + if (_bed->_status == BED_VALID) { + Revcomp(bedEntry); + _bed->reportBedNewLine(bedEntry); + } + } + _bed->Close(); +} + +namespace { +// like std::clamp from C++17 +CHRPOS clamp(const CHRPOS& a, const CHRPOS& b, const CHRPOS& c) { + return std::min(std::max(a, b), c); +} +} + +void BedRevcomp::Revcomp(BED &bed) { + CHRPOS chromSize = (CHRPOS)_genome->getChromSize(bed.chrom); + + auto rcStart = clamp(0, chromSize - bed.end, chromSize); + auto rcEnd = clamp(0, chromSize - bed.start, chromSize); + + bed.end = rcEnd; + bed.start = rcStart; +} diff --git a/src/revcompBed/revcompBed.h b/src/revcompBed/revcompBed.h new file mode 100644 index 000000000..bf4210170 --- /dev/null +++ b/src/revcompBed/revcompBed.h @@ -0,0 +1,26 @@ +#pragma once + +#include "bedFile.h" +#include "GenomeFile.h" + +#include + +class BedRevcomp { + +public: + + BedRevcomp(/*const*/ std::string &bedFile, const std::string &genomeFile, bool printHeader); + +private: + + std::string _bedFile; + std::string _genomeFile; + + bool _printHeader; + + BedFile *_bed; + GenomeFile *_genome; + + void Revcomp(); + void Revcomp(BED &bed); +}; diff --git a/src/revcompBed/revcompBedMain.cpp b/src/revcompBed/revcompBedMain.cpp new file mode 100644 index 000000000..90d6a1fff --- /dev/null +++ b/src/revcompBed/revcompBedMain.cpp @@ -0,0 +1,120 @@ +#include "revcompBed.h" +#include "version.h" + +using namespace std; + +// define our program name +#define PROGRAM_NAME "bedtools revcomp" + +// define our parameter checking macro +#define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) + +// function declarations +void revcomp_help(void); + +int revcomp_main(int argc, char* argv[]) { + + // our configuration variables + bool showHelp = false; + + // input files + string bedFile = "stdin"; + string genomeFile; + + bool haveBed = true; + bool haveGenome = false; + bool printHeader = false; + + for(int i = 1; i < argc; i++) { + int parameterLength = (int)strlen(argv[i]); + + if((PARAMETER_CHECK("-h", 2, parameterLength)) || + (PARAMETER_CHECK("--help", 5, parameterLength))) { + showHelp = true; + } + } + + if(showHelp) revcomp_help(); + + // do some parsing (all of these parameters require 2 strings) + for(int i = 1; i < argc; i++) { + + int parameterLength = (int)strlen(argv[i]); + + if(PARAMETER_CHECK("-i", 2, parameterLength)) { + if ((i+1) < argc) { + bedFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-g", 2, parameterLength)) { + if ((i+1) < argc) { + haveGenome = true; + genomeFile = argv[i + 1]; + i++; + } + } + else if(PARAMETER_CHECK("-header", 7, parameterLength)) { + printHeader = true; + } + else { + cerr << endl << "*****ERROR: Unrecognized parameter: " << argv[i] << " *****" << endl << endl; + showHelp = true; + } + } + + // make sure we have both input files + if (!haveBed || !haveGenome) { + cerr << endl << "*****" << endl << "*****ERROR: Need both a BED (-i) and a genome (-g) file. " << endl << "*****" << endl; + showHelp = true; + } + if (!showHelp) { + BedRevcomp(bedFile, genomeFile, printHeader); + return 0; + } + else { + revcomp_help(); + } + return 0; +} + +void revcomp_help(void) { + + cerr << "\nTool: bedtools revcomp (aka revcompBed)" << endl; + cerr << "Version: " << VERSION << "\n"; + cerr << "Summary: Shift each feature by requested number of base pairs." << endl << endl; + + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i -g [-s or (-p and -m)]" << endl << endl; + + cerr << "Options: " << endl; + cerr << "\t-header\t" << "Print the header from the input file prior to results." << endl << endl; + + cerr << "Notes: " << endl; + cerr << "\t(1) Starts will be set to 0 if options would force it below 0." << endl; + cerr << "\t(2) Ends will be set to the chromosome length if above the max chrom length." << endl; + cerr << "\t(3) The genome file should tab delimited and structured as follows:" << endl; + cerr << "\n\t" << endl << endl; + cerr << "\tFor example, Human (hg19):" << endl; + cerr << "\tchr1\t249250621" << endl; + cerr << "\tchr2\t243199373" << endl; + cerr << "\t..." << endl; + cerr << "\tchr18_gl000207_random\t4262" << endl << endl; + + cerr << "Tip 1. Use samtools faidx to create a genome file from a FASTA: " << endl; + cerr << "\tOne can the samtools faidx command to index a FASTA file." << endl; + cerr << "\tThe resulting .fai index is suitable as a genome file, " << endl; + cerr << "\tas bedtools will only look at the first two, relevant columns" << endl; + cerr << "\tof the .fai file." << endl << endl; + cerr << "\tFor example:" << endl; + cerr << "\tsamtools faidx GRCh38.fa" << endl; + cerr << "\tbedtools revcomp -i my.bed -g GRCh38.fa.fai" << endl << endl; + + cerr << "Tip 2. Use UCSC Table Browser to create a genome file: " << endl; + cerr << "\tOne can use the UCSC Genome Browser's MySQL database to extract" << endl; + cerr << "\tchromosome sizes. For example, H. sapiens:" << endl << endl; + cerr << "\tmysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \\" << endl; + cerr << "\t\"select chrom, size from hg19.chromInfo\" > hg19.genome" << endl << endl; + + // end the program here + exit(1); +} From 4100660a6daf640ee1cd7d248a7e383edebf9e1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabian=20Kl=C3=B6tzl?= Date: Tue, 7 Nov 2023 17:23:35 +0000 Subject: [PATCH 2/4] simplify --- src/revcompBed/revcompBed.cpp | 22 ++++++++++------------ src/revcompBed/revcompBed.h | 4 ++-- src/revcompBed/revcompBedMain.cpp | 25 +++++++++++-------------- 3 files changed, 23 insertions(+), 28 deletions(-) diff --git a/src/revcompBed/revcompBed.cpp b/src/revcompBed/revcompBed.cpp index 3c7ac0a5f..ea9d54fcc 100644 --- a/src/revcompBed/revcompBed.cpp +++ b/src/revcompBed/revcompBed.cpp @@ -5,12 +5,10 @@ BedRevcomp::BedRevcomp(/*const*/ std::string &bedFile, const std::string &genomeFile, bool printHeader): _bedFile(bedFile), _genomeFile(genomeFile), - _printHeader(printHeader) + _printHeader(printHeader), + _bed(bedFile), + _genome(genomeFile) { - - _bed = new BedFile(bedFile); - _genome = new GenomeFile(genomeFile); - Revcomp(); } @@ -18,18 +16,18 @@ void BedRevcomp::Revcomp() { BED bedEntry; // used to store the current BED line from the BED file. - _bed->Open(); + _bed.Open(); // report header first if asked. if (_printHeader == true) { - _bed->PrintHeader(); + _bed.PrintHeader(); } - while (_bed->GetNextBed(bedEntry)) { - if (_bed->_status == BED_VALID) { + while (_bed.GetNextBed(bedEntry)) { + if (_bed._status == BED_VALID) { Revcomp(bedEntry); - _bed->reportBedNewLine(bedEntry); + _bed.reportBedNewLine(bedEntry); } } - _bed->Close(); + _bed.Close(); } namespace { @@ -40,7 +38,7 @@ CHRPOS clamp(const CHRPOS& a, const CHRPOS& b, const CHRPOS& c) { } void BedRevcomp::Revcomp(BED &bed) { - CHRPOS chromSize = (CHRPOS)_genome->getChromSize(bed.chrom); + CHRPOS chromSize = (CHRPOS)_genome.getChromSize(bed.chrom); auto rcStart = clamp(0, chromSize - bed.end, chromSize); auto rcEnd = clamp(0, chromSize - bed.start, chromSize); diff --git a/src/revcompBed/revcompBed.h b/src/revcompBed/revcompBed.h index bf4210170..edefdd569 100644 --- a/src/revcompBed/revcompBed.h +++ b/src/revcompBed/revcompBed.h @@ -18,8 +18,8 @@ class BedRevcomp { bool _printHeader; - BedFile *_bed; - GenomeFile *_genome; + BedFile _bed; + GenomeFile _genome; void Revcomp(); void Revcomp(BED &bed); diff --git a/src/revcompBed/revcompBedMain.cpp b/src/revcompBed/revcompBedMain.cpp index 90d6a1fff..3186ca3a1 100644 --- a/src/revcompBed/revcompBedMain.cpp +++ b/src/revcompBed/revcompBedMain.cpp @@ -10,7 +10,7 @@ using namespace std; #define PARAMETER_CHECK(param, paramLen, actualLen) (strncmp(argv[i], param, min(actualLen, paramLen))== 0) && (actualLen == paramLen) // function declarations -void revcomp_help(void); +void revcomp_help(int); int revcomp_main(int argc, char* argv[]) { @@ -34,7 +34,7 @@ int revcomp_main(int argc, char* argv[]) { } } - if(showHelp) revcomp_help(); + if(showHelp) revcomp_help(0); // do some parsing (all of these parameters require 2 strings) for(int i = 1; i < argc; i++) { @@ -68,23 +68,20 @@ int revcomp_main(int argc, char* argv[]) { cerr << endl << "*****" << endl << "*****ERROR: Need both a BED (-i) and a genome (-g) file. " << endl << "*****" << endl; showHelp = true; } - if (!showHelp) { - BedRevcomp(bedFile, genomeFile, printHeader); - return 0; - } - else { - revcomp_help(); - } + + if (showHelp) revcomp_help(1); + + BedRevcomp(bedFile, genomeFile, printHeader); return 0; } -void revcomp_help(void) { +void revcomp_help(int exit_status) { cerr << "\nTool: bedtools revcomp (aka revcompBed)" << endl; cerr << "Version: " << VERSION << "\n"; - cerr << "Summary: Shift each feature by requested number of base pairs." << endl << endl; + cerr << "Summary: Compute the intervals on the reverse complement." << endl << endl; - cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i -g [-s or (-p and -m)]" << endl << endl; + cerr << "Usage: " << PROGRAM_NAME << " [OPTIONS] -i -g " << endl << endl; cerr << "Options: " << endl; cerr << "\t-header\t" << "Print the header from the input file prior to results." << endl << endl; @@ -92,7 +89,7 @@ void revcomp_help(void) { cerr << "Notes: " << endl; cerr << "\t(1) Starts will be set to 0 if options would force it below 0." << endl; cerr << "\t(2) Ends will be set to the chromosome length if above the max chrom length." << endl; - cerr << "\t(3) The genome file should tab delimited and structured as follows:" << endl; + cerr << "\t(3) The genome file should be tab delimited and structured as follows:" << endl; cerr << "\n\t" << endl << endl; cerr << "\tFor example, Human (hg19):" << endl; cerr << "\tchr1\t249250621" << endl; @@ -116,5 +113,5 @@ void revcomp_help(void) { cerr << "\t\"select chrom, size from hg19.chromInfo\" > hg19.genome" << endl << endl; // end the program here - exit(1); + exit(exit_status); } From 21d6f7153f90b999fc4d663ac9f419efcd885ee0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabian=20Kl=C3=B6tzl?= Date: Fri, 14 Jun 2024 11:03:18 +0100 Subject: [PATCH 3/4] simplifications --- src/revcompBed/revcompBed.cpp | 6 +++--- src/revcompBed/revcompBed.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/revcompBed/revcompBed.cpp b/src/revcompBed/revcompBed.cpp index ea9d54fcc..6803fad01 100644 --- a/src/revcompBed/revcompBed.cpp +++ b/src/revcompBed/revcompBed.cpp @@ -2,11 +2,11 @@ #include "lineFileUtilities.h" -BedRevcomp::BedRevcomp(/*const*/ std::string &bedFile, const std::string &genomeFile, bool printHeader): +BedRevcomp::BedRevcomp(const std::string &bedFile, const std::string &genomeFile, bool printHeader): _bedFile(bedFile), _genomeFile(genomeFile), _printHeader(printHeader), - _bed(bedFile), + _bed(_bedFile), _genome(genomeFile) { Revcomp(); @@ -38,7 +38,7 @@ CHRPOS clamp(const CHRPOS& a, const CHRPOS& b, const CHRPOS& c) { } void BedRevcomp::Revcomp(BED &bed) { - CHRPOS chromSize = (CHRPOS)_genome.getChromSize(bed.chrom); + CHRPOS chromSize = _genome.getChromSize(bed.chrom); auto rcStart = clamp(0, chromSize - bed.end, chromSize); auto rcEnd = clamp(0, chromSize - bed.start, chromSize); diff --git a/src/revcompBed/revcompBed.h b/src/revcompBed/revcompBed.h index edefdd569..14ce76026 100644 --- a/src/revcompBed/revcompBed.h +++ b/src/revcompBed/revcompBed.h @@ -9,7 +9,7 @@ class BedRevcomp { public: - BedRevcomp(/*const*/ std::string &bedFile, const std::string &genomeFile, bool printHeader); + BedRevcomp(const std::string &bedFile, const std::string &genomeFile, bool printHeader); private: From 575e4540b5b081d44006aa84466b120d7206b5f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabian=20Kl=C3=B6tzl?= Date: Fri, 14 Jun 2024 11:20:10 +0100 Subject: [PATCH 4/4] add tests --- test/revcomp/a.bed | 2 ++ test/revcomp/b.bed | 2 ++ test/revcomp/huge.genome | 1 + test/revcomp/test-revcomp.sh | 54 ++++++++++++++++++++++++++++++++++++ test/revcomp/tiny.genome | 1 + 5 files changed, 60 insertions(+) create mode 100644 test/revcomp/a.bed create mode 100644 test/revcomp/b.bed create mode 100644 test/revcomp/huge.genome create mode 100755 test/revcomp/test-revcomp.sh create mode 100644 test/revcomp/tiny.genome diff --git a/test/revcomp/a.bed b/test/revcomp/a.bed new file mode 100644 index 000000000..e0ddf2203 --- /dev/null +++ b/test/revcomp/a.bed @@ -0,0 +1,2 @@ +chr1 100 200 a1 1 + +chr1 100 200 a2 2 - diff --git a/test/revcomp/b.bed b/test/revcomp/b.bed new file mode 100644 index 000000000..a147e8ac1 --- /dev/null +++ b/test/revcomp/b.bed @@ -0,0 +1,2 @@ +chr1 66999638 67216822 NM_032291 0 + +chr1 92145899 92351836 NR_036634 0 - diff --git a/test/revcomp/huge.genome b/test/revcomp/huge.genome new file mode 100644 index 000000000..d8ebc53aa --- /dev/null +++ b/test/revcomp/huge.genome @@ -0,0 +1 @@ +chr1 249250621 diff --git a/test/revcomp/test-revcomp.sh b/test/revcomp/test-revcomp.sh new file mode 100755 index 000000000..147ecff55 --- /dev/null +++ b/test/revcomp/test-revcomp.sh @@ -0,0 +1,54 @@ +set -e; +BT="${BT-../../bin/bedtools}" + +FAILURES=0; + +check() +{ + if diff $1 $2; then + echo ok + + else + FAILURES=$(expr $FAILURES + 1); + echo fail + + fi +} + +# cat a.bed +# chr1 100 200 a1 1 + +# chr1 100 200 a2 2 - + +########################################################### +# test simple +########################################################### +echo -e " revcomp.t1...\c" +echo \ +"chr1 800 900 a1 1 + +chr1 800 900 a2 2 -" > exp +"$BT" revcomp -i a.bed -g tiny.genome > obs +check obs exp +rm obs exp + +echo -e " revcomp.t2...\c" +echo \ +"chr1 249250421 249250521 a1 1 + +chr1 249250421 249250521 a2 2 -" > exp +"$BT" revcomp -i a.bed -g huge.genome > obs +check obs exp +rm obs exp + + +########################################################### +# test clamp +########################################################### +echo -e " revcomp.t3...\c" +echo \ +"chr1 0 0 NM_032291 0 + +chr1 0 0 NR_036634 0 -" > exp +"$BT" revcomp -i b.bed -g tiny.genome > obs +check obs exp +rm obs exp + + +[[ $FAILURES -eq 0 ]] || exit 1; diff --git a/test/revcomp/tiny.genome b/test/revcomp/tiny.genome new file mode 100644 index 000000000..0dd8b2229 --- /dev/null +++ b/test/revcomp/tiny.genome @@ -0,0 +1 @@ +chr1 1000