-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
152 additions
and
51 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
Package: vcfppR | ||
Title: Rapid Manipulation of the Variant Call Format (VCF) | ||
Version: 0.5.1 | ||
Version: 0.6.0 | ||
Authors@R: c( | ||
person("Zilong", "Li", , "[email protected]", role = c("aut", "cre"), | ||
comment = c(ORCID = "0000-0001-5859-2078")), | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,7 +2,7 @@ | |
* @file https://github.com/Zilong-Li/vcfpp/vcfpp.h | ||
* @author Zilong Li | ||
* @email [email protected] | ||
* @version v0.5.2 | ||
* @version v0.6.0 | ||
* @breif a single C++ file for manipulating VCF | ||
* Copyright (C) 2022-2023.The use of this code is governed by the LICENSE file. | ||
******************************************************************************/ | ||
|
@@ -49,6 +49,7 @@ | |
// make sure you have htslib installed | ||
extern "C" | ||
{ | ||
# include <htslib/hts.h> | ||
# include <htslib/kstring.h> | ||
# include <htslib/tbx.h> | ||
# include <htslib/vcf.h> | ||
|
@@ -94,6 +95,9 @@ using isFormatVector = typename std::enable_if<std::is_same<T, std::vector<float | |
|| std::is_same<T, std::vector<int>>::value, | ||
bool>::type; | ||
|
||
namespace details | ||
{ | ||
|
||
template<typename T> | ||
isScalar<T> isMissing(T const & v) | ||
{ | ||
|
@@ -151,16 +155,43 @@ inline std::vector<std::string> split_string(const std::string & s, const std::s | |
} | ||
|
||
// deleter for htsFile | ||
struct htsFile_close | ||
struct hts_file_close | ||
{ | ||
void operator()(htsFile * x) | ||
{ | ||
if(x) hts_close(x); | ||
} | ||
}; | ||
|
||
// deleter for hts iterator | ||
struct hts_iter_close | ||
{ | ||
void operator()(hts_itr_t * x) | ||
{ | ||
if(x) hts_itr_destroy(x); | ||
} | ||
}; | ||
|
||
// deleter for hts idx | ||
struct hts_idx_close | ||
{ | ||
void operator()(hts_idx_t * x) | ||
{ | ||
if(x) hts_idx_destroy(x); | ||
} | ||
}; | ||
|
||
// deleter for tabix idx | ||
struct tabix_idx_close | ||
{ | ||
void operator()(tbx_t * x) | ||
{ | ||
if(x) tbx_destroy(x); | ||
} | ||
}; | ||
|
||
// deleter for variant | ||
struct variant_close | ||
struct bcf_line_close | ||
{ | ||
void operator()(bcf1_t * x) | ||
{ | ||
|
@@ -177,6 +208,8 @@ struct bcf_hdr_close | |
} | ||
}; | ||
|
||
} // namespace details | ||
|
||
/** | ||
* @class BcfHeader | ||
* @brief Object represents header of the VCF, offering methods to access and modify the tags | ||
|
@@ -380,7 +413,7 @@ class BcfHeader | |
* */ | ||
void updateSamples(const std::string & samples) | ||
{ | ||
auto ss = split_string(samples, ","); | ||
auto ss = details::split_string(samples, ","); | ||
const int nsamples = nSamples(); | ||
if(nsamples != (int)ss.size()) | ||
throw std::runtime_error("You provide either too few or too many samples"); | ||
|
@@ -466,7 +499,7 @@ class BcfRecord | |
|
||
private: | ||
BcfHeader * header; | ||
std::shared_ptr<bcf1_t> line = std::shared_ptr<bcf1_t>(bcf_init(), variant_close()); // variant | ||
std::shared_ptr<bcf1_t> line = std::shared_ptr<bcf1_t>(bcf_init(), details::bcf_line_close()); // variant | ||
bcf_hdr_t * hdr_d = NULL; // a dup header by bcf_hdr_dup(header->hdr) | ||
bcf_fmt_t * fmt = NULL; | ||
bcf_info_t * info = NULL; | ||
|
@@ -1517,9 +1550,9 @@ class BcfReader | |
{ | ||
private: | ||
std::shared_ptr<htsFile> fp; // hts file | ||
hts_idx_t * hidx = NULL; // hts index file | ||
tbx_t * tidx = NULL; // .tbi .csi index file for vcf files | ||
hts_itr_t * itr = NULL; // tabix iterator | ||
std::shared_ptr<hts_idx_t> hidx; // hts index file | ||
std::shared_ptr<tbx_t> tidx; // .tbi .csi index file for vcf files | ||
std::shared_ptr<hts_itr_t> itr; // hts iterator | ||
kstring_t s = {0, 0, NULL}; // kstring | ||
std::string fname; | ||
bool isBcf; // if the input file is bcf or vcf; | ||
|
@@ -1529,7 +1562,7 @@ class BcfReader | |
BcfHeader header; | ||
/// number of samples in the VCF | ||
int nsamples; | ||
/// number of samples in the VCF | ||
/// a vector of samples name in the VCF | ||
std::vector<std::string> SamplesName; | ||
|
||
/// Construct an empty BcfReader | ||
|
@@ -1575,24 +1608,25 @@ class BcfReader | |
|
||
~BcfReader() | ||
{ | ||
close(); | ||
if(s.s) free(s.s); | ||
} | ||
|
||
/// close the BcfReader object. | ||
/// Close the VCF file and its associated files | ||
void close() | ||
{ | ||
if(fp) fp.reset(); | ||
if(itr) hts_itr_destroy(itr); | ||
if(hidx) hts_idx_destroy(hidx); | ||
if(tidx) tbx_destroy(tidx); | ||
if(s.s) free(s.s); | ||
if(hidx) hidx.reset(); | ||
if(itr) itr.reset(); | ||
if(tidx) tidx.reset(); | ||
} | ||
|
||
/// open a VCF/BCF/STDIN file for streaming in | ||
/// Open a VCF/BCF/STDIN file for streaming in | ||
void open(const std::string & file) | ||
{ | ||
if(!fname.empty() && fname != file) | ||
throw std::runtime_error("does not support re-open a file yet. " + fname); | ||
fname = file; | ||
fp = std::shared_ptr<htsFile>(hts_open(file.c_str(), "r"), htsFile_close()); | ||
fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), "r"), details::hts_file_close()); | ||
if(!fp) throw std::invalid_argument("I/O error: input file is invalid"); | ||
header.hdr = bcf_hdr_read(fp.get()); | ||
nsamples = bcf_hdr_nsamples(header.hdr); | ||
|
@@ -1611,12 +1645,44 @@ class BcfReader | |
return header; | ||
} | ||
|
||
/** @brief get the number of records of given region */ | ||
uint64_t getVariantsCount(BcfRecord & r, const std::string & region) | ||
/** | ||
* @brief query the status of a given region in the VCF | ||
* @return -2: the region is not a valid bcftools-like format, | ||
* or it is not presenting in the VCF even though it's bcftols-like format. \n | ||
* -1: there is no index file found. \n | ||
* 0: the region is valid but empty. \n | ||
* 1: vaild and not empty. \n | ||
*/ | ||
int getStatus(const std::string & region) | ||
{ | ||
try | ||
{ | ||
setRegion(region); | ||
BcfRecord v(header); | ||
if(!getNextVariant(v)) return 0; | ||
} | ||
catch(const std::invalid_argument & e) | ||
{ | ||
return -1; | ||
} | ||
catch(const std::runtime_error & e) | ||
{ | ||
return -2; | ||
} | ||
return 1; | ||
} | ||
|
||
/** | ||
* @brief count the number of variants by iterating through a given region. | ||
* @note If you want to continue work on that region, remember to reset the region by setRegion()! \n | ||
* Also, check the status of the region first to handle the different cases! | ||
*/ | ||
int getVariantsCount(const std::string & region) | ||
{ | ||
uint64_t c{0}; | ||
while(getNextVariant(r)) c++; | ||
setRegion(region); // reset the region | ||
int c{0}; | ||
setRegion(region); | ||
BcfRecord v(header); | ||
while(getNextVariant(v)) c++; | ||
return c; | ||
} | ||
|
||
|
@@ -1633,54 +1699,61 @@ class BcfReader | |
} | ||
|
||
/** | ||
* @brief explicitly stream to specific region | ||
* @param region the string is samtools-like format which is chr:start-end | ||
* @brief explicitly stream to specific region. throw invalid_argument error if index file not found. | ||
* throw runtime_error if the region was not a valid bcftools-like format or was not presenting in the | ||
* VCF. | ||
* @param region the string for region is samtools-like format, which can be 'chr', 'chr:start' and | ||
* 'chr:start-end' | ||
* */ | ||
void setRegion(const std::string & region) | ||
{ | ||
// 1. check and load index first | ||
// 2. query iterval region | ||
// 3. if region is empty, use "." | ||
if(isEndWith(fname, "bcf") || isEndWith(fname, "bcf.gz")) | ||
if(details::isEndWith(fname, "bcf") || details::isEndWith(fname, "bcf.gz")) | ||
{ | ||
isBcf = true; | ||
hidx = bcf_index_load(fname.c_str()); | ||
if(itr) bcf_itr_destroy(itr); // reset current region. | ||
hidx = std::shared_ptr<hts_idx_t>(bcf_index_load(fname.c_str()), details::hts_idx_close()); | ||
if(itr) itr.reset(); // reset current region. | ||
if(region.empty()) | ||
itr = bcf_itr_querys(hidx, header.hdr, "."); | ||
itr = std::shared_ptr<hts_itr_t>(bcf_itr_querys(hidx.get(), header.hdr, "."), | ||
details::hts_iter_close()); | ||
else | ||
itr = bcf_itr_querys(hidx, header.hdr, region.c_str()); | ||
itr = std::shared_ptr<hts_itr_t>(bcf_itr_querys(hidx.get(), header.hdr, region.c_str()), | ||
details::hts_iter_close()); | ||
} | ||
else | ||
{ | ||
isBcf = false; | ||
tidx = tbx_index_load(fname.c_str()); | ||
if(tidx == NULL) throw std::runtime_error("error in loading tabix index!"); | ||
if(itr) tbx_itr_destroy(itr); // reset current region. | ||
tidx = std::shared_ptr<tbx_t>(tbx_index_load(fname.c_str()), details::tabix_idx_close()); | ||
if(tidx.get() == NULL) throw std::invalid_argument(" no tabix index found!"); | ||
if(itr) itr.reset(); // reset | ||
if(region.empty()) | ||
itr = tbx_itr_querys(tidx, "."); | ||
itr = std::shared_ptr<hts_itr_t>(tbx_itr_querys(tidx.get(), "."), details::hts_iter_close()); | ||
else | ||
itr = tbx_itr_querys(tidx, region.c_str()); | ||
if(itr == NULL) throw std::runtime_error("no interval region found!"); | ||
itr = std::shared_ptr<hts_itr_t>(tbx_itr_querys(tidx.get(), region.c_str()), | ||
details::hts_iter_close()); | ||
} | ||
if(itr.get() == NULL) | ||
throw std::runtime_error("region was not found! make sure the region format is correct"); | ||
} | ||
|
||
/** @brief read in the next variant | ||
* @param r r is a BcfRecord object to be filled in. */ | ||
* @param r the BcfRecord object to be filled in. */ | ||
bool getNextVariant(BcfRecord & r) | ||
{ | ||
int ret = -1; | ||
if(itr != NULL) | ||
if(itr.get() != NULL) | ||
{ | ||
if(isBcf) | ||
{ | ||
ret = bcf_itr_next(fp.get(), itr, r.line.get()); | ||
ret = bcf_itr_next(fp.get(), itr.get(), r.line.get()); | ||
bcf_unpack(r.line.get(), BCF_UN_ALL); | ||
return (ret >= 0); | ||
} | ||
else | ||
{ | ||
int slen = tbx_itr_next(fp.get(), tidx, itr, &s); | ||
int slen = tbx_itr_next(fp.get(), tidx.get(), itr.get(), &s); | ||
if(slen > 0) | ||
{ | ||
ret = vcf_parse1(&s, r.header->hdr, r.line.get()); // ret > 0, error | ||
|
@@ -1707,7 +1780,7 @@ class BcfWriter | |
{ | ||
private: | ||
std::shared_ptr<htsFile> fp; // hts file | ||
std::shared_ptr<bcf1_t> b = std::shared_ptr<bcf1_t>(bcf_init(), variant_close()); // variant | ||
std::shared_ptr<bcf1_t> b = std::shared_ptr<bcf1_t>(bcf_init(), details::bcf_line_close()); // variant | ||
int ret; | ||
bool isHeaderWritten = false; | ||
const BcfHeader * hp; | ||
|
@@ -1783,8 +1856,8 @@ class BcfWriter | |
*/ | ||
void open(const std::string & fname) | ||
{ | ||
auto mode = getMode(fname, "w"); | ||
fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), mode.c_str()), htsFile_close()); | ||
auto mode = details::getMode(fname, "w"); | ||
fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), mode.c_str()), details::hts_file_close()); | ||
if(!fp) throw std::invalid_argument("I/O error: input file is invalid"); | ||
} | ||
|
||
|
@@ -1799,7 +1872,7 @@ class BcfWriter | |
*/ | ||
void open(const std::string & fname, const std::string & mode) | ||
{ | ||
fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), mode.c_str()), htsFile_close()); | ||
fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), mode.c_str()), details::hts_file_close()); | ||
if(!fp) throw std::invalid_argument("I/O error: input file is invalid"); | ||
} | ||
|
||
|
@@ -1872,7 +1945,7 @@ class BcfWriter | |
} | ||
|
||
/// streams out the given variant of BcfRecord type | ||
inline bool writeRecord(BcfRecord & v) | ||
bool writeRecord(BcfRecord & v) | ||
{ | ||
if(!isHeaderWritten) writeHeader(); | ||
if(bcf_write(fp.get(), v.header->hdr, v.line.get()) < 0) return false; | ||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
Oops, something went wrong.