From 2111f6029fb0e9f8a1ac90347f9e0f70e4fbb551 Mon Sep 17 00:00:00 2001 From: Zilong-Li Date: Sat, 28 Sep 2024 15:52:31 +0200 Subject: [PATCH] v0.6.0 --- DESCRIPTION | 2 +- NEWS.md | 5 ++ R/RcppExports.R | 2 + inst/include/vcfpp.h | 161 +++++++++++++++++++++++++++++++------------ man/vcfreader.Rd | 4 ++ src/vcf-reader.cpp | 29 ++++++-- 6 files changed, 152 insertions(+), 51 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 18ba4c8..9546291 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: vcfppR Title: Rapid Manipulation of the Variant Call Format (VCF) -Version: 0.5.1 +Version: 0.6.0 Authors@R: c( person("Zilong", "Li", , "zilong.dk@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-5859-2078")), diff --git a/NEWS.md b/NEWS.md index 7657fb9..b85b6e0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,8 @@ +# vcfppR 0.6.0 +* upgrade `vcfpp.h` to v0.6.0 +* new API `vcfreader$setRegion` +* new API `vcfreader$getStatus` + # vcfppR 0.5.0 * upgrade `vcfpp.h` to v0.5.1 * add `bcfreader$updateSamples()` function diff --git a/R/RcppExports.R b/R/RcppExports.R index f05b313..e412550 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -21,6 +21,8 @@ heterozygosity <- function(vcffile, region = "", samples = "-", pass = FALSE, qu #' \item Parameter: region - The region to be constrained #' \item Parameter: samples - The samples to be constrained. Comma separated list of samples to include (or exclude with "^" prefix). #' } +#' @field setRegion try to set specific region to work with. will throw errors if no index or region found. Use getStatus to check if the region is valid or empty! +#' @field getStatus return 1: region is valid and not empty. 0: region is valid but empty. -1: no index file. -2: region not found or invalid region form #' @field variant Try to get next variant record. return FALSE if there are no more variants or hit the end of file, otherwise TRUE. #' @field chr Return the CHROM field of current variant #' @field pos Return the POS field of current variant diff --git a/inst/include/vcfpp.h b/inst/include/vcfpp.h index fd9acf5..46394d9 100644 --- a/inst/include/vcfpp.h +++ b/inst/include/vcfpp.h @@ -2,7 +2,7 @@ * @file https://github.com/Zilong-Li/vcfpp/vcfpp.h * @author Zilong Li * @email zilong.dk@gmail.com - * @version v0.5.2 + * @version v0.6.0 * @breif a single C++ file for manipulating VCF * Copyright (C) 2022-2023.The use of this code is governed by the LICENSE file. ******************************************************************************/ @@ -49,6 +49,7 @@ // make sure you have htslib installed extern "C" { +# include # include # include # include @@ -94,6 +95,9 @@ using isFormatVector = typename std::enable_if>::value, bool>::type; +namespace details +{ + template isScalar isMissing(T const & v) { @@ -151,7 +155,7 @@ inline std::vector split_string(const std::string & s, const std::s } // deleter for htsFile -struct htsFile_close +struct hts_file_close { void operator()(htsFile * x) { @@ -159,8 +163,35 @@ struct htsFile_close } }; +// deleter for hts iterator +struct hts_iter_close +{ + void operator()(hts_itr_t * x) + { + if(x) hts_itr_destroy(x); + } +}; + +// deleter for hts idx +struct hts_idx_close +{ + void operator()(hts_idx_t * x) + { + if(x) hts_idx_destroy(x); + } +}; + +// deleter for tabix idx +struct tabix_idx_close +{ + void operator()(tbx_t * x) + { + if(x) tbx_destroy(x); + } +}; + // deleter for variant -struct variant_close +struct bcf_line_close { void operator()(bcf1_t * x) { @@ -177,6 +208,8 @@ struct bcf_hdr_close } }; +} // namespace details + /** * @class BcfHeader * @brief Object represents header of the VCF, offering methods to access and modify the tags @@ -380,7 +413,7 @@ class BcfHeader * */ void updateSamples(const std::string & samples) { - auto ss = split_string(samples, ","); + auto ss = details::split_string(samples, ","); const int nsamples = nSamples(); if(nsamples != (int)ss.size()) throw std::runtime_error("You provide either too few or too many samples"); @@ -466,7 +499,7 @@ class BcfRecord private: BcfHeader * header; - std::shared_ptr line = std::shared_ptr(bcf_init(), variant_close()); // variant + std::shared_ptr line = std::shared_ptr(bcf_init(), details::bcf_line_close()); // variant bcf_hdr_t * hdr_d = NULL; // a dup header by bcf_hdr_dup(header->hdr) bcf_fmt_t * fmt = NULL; bcf_info_t * info = NULL; @@ -1517,9 +1550,9 @@ class BcfReader { private: std::shared_ptr fp; // hts file - hts_idx_t * hidx = NULL; // hts index file - tbx_t * tidx = NULL; // .tbi .csi index file for vcf files - hts_itr_t * itr = NULL; // tabix iterator + std::shared_ptr hidx; // hts index file + std::shared_ptr tidx; // .tbi .csi index file for vcf files + std::shared_ptr itr; // hts iterator kstring_t s = {0, 0, NULL}; // kstring std::string fname; bool isBcf; // if the input file is bcf or vcf; @@ -1529,7 +1562,7 @@ class BcfReader BcfHeader header; /// number of samples in the VCF int nsamples; - /// number of samples in the VCF + /// a vector of samples name in the VCF std::vector SamplesName; /// Construct an empty BcfReader @@ -1575,24 +1608,25 @@ class BcfReader ~BcfReader() { - close(); + if(s.s) free(s.s); } - /// close the BcfReader object. + /// Close the VCF file and its associated files void close() { if(fp) fp.reset(); - if(itr) hts_itr_destroy(itr); - if(hidx) hts_idx_destroy(hidx); - if(tidx) tbx_destroy(tidx); - if(s.s) free(s.s); + if(hidx) hidx.reset(); + if(itr) itr.reset(); + if(tidx) tidx.reset(); } - /// open a VCF/BCF/STDIN file for streaming in + /// Open a VCF/BCF/STDIN file for streaming in void open(const std::string & file) { + if(!fname.empty() && fname != file) + throw std::runtime_error("does not support re-open a file yet. " + fname); fname = file; - fp = std::shared_ptr(hts_open(file.c_str(), "r"), htsFile_close()); + fp = std::shared_ptr(hts_open(fname.c_str(), "r"), details::hts_file_close()); if(!fp) throw std::invalid_argument("I/O error: input file is invalid"); header.hdr = bcf_hdr_read(fp.get()); nsamples = bcf_hdr_nsamples(header.hdr); @@ -1611,12 +1645,44 @@ class BcfReader return header; } - /** @brief get the number of records of given region */ - uint64_t getVariantsCount(BcfRecord & r, const std::string & region) + /** + * @brief query the status of a given region in the VCF + * @return -2: the region is not a valid bcftools-like format, + * or it is not presenting in the VCF even though it's bcftols-like format. \n + * -1: there is no index file found. \n + * 0: the region is valid but empty. \n + * 1: vaild and not empty. \n + */ + int getStatus(const std::string & region) + { + try + { + setRegion(region); + BcfRecord v(header); + if(!getNextVariant(v)) return 0; + } + catch(const std::invalid_argument & e) + { + return -1; + } + catch(const std::runtime_error & e) + { + return -2; + } + return 1; + } + + /** + * @brief count the number of variants by iterating through a given region. + * @note If you want to continue work on that region, remember to reset the region by setRegion()! \n + * Also, check the status of the region first to handle the different cases! + */ + int getVariantsCount(const std::string & region) { - uint64_t c{0}; - while(getNextVariant(r)) c++; - setRegion(region); // reset the region + int c{0}; + setRegion(region); + BcfRecord v(header); + while(getNextVariant(v)) c++; return c; } @@ -1633,54 +1699,61 @@ class BcfReader } /** - * @brief explicitly stream to specific region - * @param region the string is samtools-like format which is chr:start-end + * @brief explicitly stream to specific region. throw invalid_argument error if index file not found. + * throw runtime_error if the region was not a valid bcftools-like format or was not presenting in the + * VCF. + * @param region the string for region is samtools-like format, which can be 'chr', 'chr:start' and + * 'chr:start-end' * */ void setRegion(const std::string & region) { // 1. check and load index first // 2. query iterval region // 3. if region is empty, use "." - if(isEndWith(fname, "bcf") || isEndWith(fname, "bcf.gz")) + if(details::isEndWith(fname, "bcf") || details::isEndWith(fname, "bcf.gz")) { isBcf = true; - hidx = bcf_index_load(fname.c_str()); - if(itr) bcf_itr_destroy(itr); // reset current region. + hidx = std::shared_ptr(bcf_index_load(fname.c_str()), details::hts_idx_close()); + if(itr) itr.reset(); // reset current region. if(region.empty()) - itr = bcf_itr_querys(hidx, header.hdr, "."); + itr = std::shared_ptr(bcf_itr_querys(hidx.get(), header.hdr, "."), + details::hts_iter_close()); else - itr = bcf_itr_querys(hidx, header.hdr, region.c_str()); + itr = std::shared_ptr(bcf_itr_querys(hidx.get(), header.hdr, region.c_str()), + details::hts_iter_close()); } else { isBcf = false; - tidx = tbx_index_load(fname.c_str()); - if(tidx == NULL) throw std::runtime_error("error in loading tabix index!"); - if(itr) tbx_itr_destroy(itr); // reset current region. + tidx = std::shared_ptr(tbx_index_load(fname.c_str()), details::tabix_idx_close()); + if(tidx.get() == NULL) throw std::invalid_argument(" no tabix index found!"); + if(itr) itr.reset(); // reset if(region.empty()) - itr = tbx_itr_querys(tidx, "."); + itr = std::shared_ptr(tbx_itr_querys(tidx.get(), "."), details::hts_iter_close()); else - itr = tbx_itr_querys(tidx, region.c_str()); - if(itr == NULL) throw std::runtime_error("no interval region found!"); + itr = std::shared_ptr(tbx_itr_querys(tidx.get(), region.c_str()), + details::hts_iter_close()); } + if(itr.get() == NULL) + throw std::runtime_error("region was not found! make sure the region format is correct"); } /** @brief read in the next variant - * @param r r is a BcfRecord object to be filled in. */ + * @param r the BcfRecord object to be filled in. */ bool getNextVariant(BcfRecord & r) { int ret = -1; - if(itr != NULL) + if(itr.get() != NULL) { if(isBcf) { - ret = bcf_itr_next(fp.get(), itr, r.line.get()); + ret = bcf_itr_next(fp.get(), itr.get(), r.line.get()); bcf_unpack(r.line.get(), BCF_UN_ALL); return (ret >= 0); } else { - int slen = tbx_itr_next(fp.get(), tidx, itr, &s); + int slen = tbx_itr_next(fp.get(), tidx.get(), itr.get(), &s); if(slen > 0) { ret = vcf_parse1(&s, r.header->hdr, r.line.get()); // ret > 0, error @@ -1707,7 +1780,7 @@ class BcfWriter { private: std::shared_ptr fp; // hts file - std::shared_ptr b = std::shared_ptr(bcf_init(), variant_close()); // variant + std::shared_ptr b = std::shared_ptr(bcf_init(), details::bcf_line_close()); // variant int ret; bool isHeaderWritten = false; const BcfHeader * hp; @@ -1783,8 +1856,8 @@ class BcfWriter */ void open(const std::string & fname) { - auto mode = getMode(fname, "w"); - fp = std::shared_ptr(hts_open(fname.c_str(), mode.c_str()), htsFile_close()); + auto mode = details::getMode(fname, "w"); + fp = std::shared_ptr(hts_open(fname.c_str(), mode.c_str()), details::hts_file_close()); if(!fp) throw std::invalid_argument("I/O error: input file is invalid"); } @@ -1799,7 +1872,7 @@ class BcfWriter */ void open(const std::string & fname, const std::string & mode) { - fp = std::shared_ptr(hts_open(fname.c_str(), mode.c_str()), htsFile_close()); + fp = std::shared_ptr(hts_open(fname.c_str(), mode.c_str()), details::hts_file_close()); if(!fp) throw std::invalid_argument("I/O error: input file is invalid"); } @@ -1872,7 +1945,7 @@ class BcfWriter } /// streams out the given variant of BcfRecord type - inline bool writeRecord(BcfRecord & v) + bool writeRecord(BcfRecord & v) { if(!isHeaderWritten) writeHeader(); if(bcf_write(fp.get(), v.header->hdr, v.line.get()) < 0) return false; diff --git a/man/vcfreader.Rd b/man/vcfreader.Rd index a4562a0..0d5a33f 100644 --- a/man/vcfreader.Rd +++ b/man/vcfreader.Rd @@ -27,6 +27,10 @@ Type the name of the class to see the details and methods \item Parameter: samples - The samples to be constrained. Comma separated list of samples to include (or exclude with "^" prefix). }} +\item{\code{setRegion}}{try to set specific region to work with. will throw errors if no index or region found. Use getStatus to check if the region is valid or empty!} + +\item{\code{getStatus}}{return 1: region is valid and not empty. 0: region is valid but empty. -1: no index file. -2: region not found or invalid region form} + \item{\code{variant}}{Try to get next variant record. return FALSE if there are no more variants or hit the end of file, otherwise TRUE.} \item{\code{chr}}{Return the CHROM field of current variant} diff --git a/src/vcf-reader.cpp b/src/vcf-reader.cpp index bddcc52..1cde5c0 100644 --- a/src/vcf-reader.cpp +++ b/src/vcf-reader.cpp @@ -19,6 +19,8 @@ using namespace std; //' \item Parameter: region - The region to be constrained //' \item Parameter: samples - The samples to be constrained. Comma separated list of samples to include (or exclude with "^" prefix). //' } +//' @field setRegion try to set specific region to work with. will throw errors if no index or region found. Use getStatus to check if the region is valid or empty! +//' @field getStatus return 1: region is valid and not empty. 0: region is valid but empty. -1: no index file. -2: region not found or invalid region form //' @field variant Try to get next variant record. return FALSE if there are no more variants or hit the end of file, otherwise TRUE. //' @field chr Return the CHROM field of current variant //' @field pos Return the POS field of current variant @@ -146,7 +148,10 @@ class vcfreader { ~vcfreader() {} - bool variant() { return br.getNextVariant(var); } + inline void setRegion(const std::string& region) { br.setRegion(region); } + inline int getStatus(const std::string& region) { return br.getStatus(region); } + + inline bool variant() { return br.getNextVariant(var); } inline std::string chr() const { return var.CHROM(); } inline std::string id() const { return var.ID(); } @@ -194,7 +199,9 @@ class vcfreader { } vector genotypes(bool collapse) { - if (!var.getGenotypes(v_int)) { return vector(); } + if (!var.getGenotypes(v_int)) { + return vector(); + } if (var.ploidy() == 2 && collapse) { for (size_t i = 0; i < v_int.size(); i += 2) { v_int[i + 1] += v_int[i]; @@ -213,7 +220,9 @@ class vcfreader { } vector formatInt(std::string tag) { - if (!var.getFORMAT(tag, v_int)) { return vector(); } + if (!var.getFORMAT(tag, v_int)) { + return vector(); + } int nvals = v_int.size() / br.nsamples; // how many values per sample for (int i = 0; i < br.nsamples; i++) { for (int j = 0; j < nvals; j++) @@ -289,7 +298,9 @@ class vcfreader { modifiable = true; } inline void updateSamples(const std::string& s) { - if (!modifiable) { modify(); } + if (!modifiable) { + modify(); + } bw.header.updateSamples(s); } inline void write() { @@ -351,7 +362,9 @@ class vcfreader { Rcpp::Rcout << "please call the `output()` function first to creat an output VCF\n"; return; } - if (!modifiable) { modify(); } + if (!modifiable) { + modify(); + } bw.header.addINFO(id, number, type, desc); } inline void addFORMAT(const std::string& id, const std::string& number, const std::string& type, @@ -360,7 +373,9 @@ class vcfreader { Rcpp::Rcout << "please call the `output()` function first to creat an output VCF\n"; return; } - if (!modifiable) { modify(); } + if (!modifiable) { + modify(); + } bw.header.addFORMAT(id, number, type, desc); } @@ -385,6 +400,8 @@ RCPP_MODULE(vcfreader) { .constructor("construct vcfreader given vcffile and region") .constructor( "construct vcfreader given vcf file, region and samples") + .method("setRegion", &vcfreader::setRegion, "set region to work with") + .method("getStatus", &vcfreader::getStatus, "query the status of a region") .method("variant", &vcfreader::variant, "get next variant record") .method("chr", &vcfreader::chr, "get CHROM") .method("id", &vcfreader::id, "get ID")