Skip to content

Commit

Permalink
v0.6.0
Browse files Browse the repository at this point in the history
  • Loading branch information
Zilong-Li committed Sep 28, 2024
1 parent 0b4c224 commit 2111f60
Show file tree
Hide file tree
Showing 6 changed files with 152 additions and 51 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: vcfppR
Title: Rapid Manipulation of the Variant Call Format (VCF)
Version: 0.5.1
Version: 0.6.0
Authors@R: c(
person("Zilong", "Li", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-5859-2078")),
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# vcfppR 0.6.0
* upgrade `vcfpp.h` to v0.6.0
* new API `vcfreader$setRegion`
* new API `vcfreader$getStatus`

# vcfppR 0.5.0
* upgrade `vcfpp.h` to v0.5.1
* add `bcfreader$updateSamples()` function
Expand Down
2 changes: 2 additions & 0 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ heterozygosity <- function(vcffile, region = "", samples = "-", pass = FALSE, qu
#' \item Parameter: region - The region to be constrained
#' \item Parameter: samples - The samples to be constrained. Comma separated list of samples to include (or exclude with "^" prefix).
#' }
#' @field setRegion try to set specific region to work with. will throw errors if no index or region found. Use getStatus to check if the region is valid or empty!
#' @field getStatus return 1: region is valid and not empty. 0: region is valid but empty. -1: no index file. -2: region not found or invalid region form
#' @field variant Try to get next variant record. return FALSE if there are no more variants or hit the end of file, otherwise TRUE.
#' @field chr Return the CHROM field of current variant
#' @field pos Return the POS field of current variant
Expand Down
161 changes: 117 additions & 44 deletions inst/include/vcfpp.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
* @file https://github.com/Zilong-Li/vcfpp/vcfpp.h
* @author Zilong Li
* @email [email protected]
* @version v0.5.2
* @version v0.6.0
* @breif a single C++ file for manipulating VCF
* Copyright (C) 2022-2023.The use of this code is governed by the LICENSE file.
******************************************************************************/
Expand Down Expand Up @@ -49,6 +49,7 @@
// make sure you have htslib installed
extern "C"
{
# include <htslib/hts.h>
# include <htslib/kstring.h>
# include <htslib/tbx.h>
# include <htslib/vcf.h>
Expand Down Expand Up @@ -94,6 +95,9 @@ using isFormatVector = typename std::enable_if<std::is_same<T, std::vector<float
|| std::is_same<T, std::vector<int>>::value,
bool>::type;

namespace details
{

template<typename T>
isScalar<T> isMissing(T const & v)
{
Expand Down Expand Up @@ -151,16 +155,43 @@ inline std::vector<std::string> split_string(const std::string & s, const std::s
}

// deleter for htsFile
struct htsFile_close
struct hts_file_close
{
void operator()(htsFile * x)
{
if(x) hts_close(x);
}
};

// deleter for hts iterator
struct hts_iter_close
{
void operator()(hts_itr_t * x)
{
if(x) hts_itr_destroy(x);
}
};

// deleter for hts idx
struct hts_idx_close
{
void operator()(hts_idx_t * x)
{
if(x) hts_idx_destroy(x);
}
};

// deleter for tabix idx
struct tabix_idx_close
{
void operator()(tbx_t * x)
{
if(x) tbx_destroy(x);
}
};

// deleter for variant
struct variant_close
struct bcf_line_close
{
void operator()(bcf1_t * x)
{
Expand All @@ -177,6 +208,8 @@ struct bcf_hdr_close
}
};

} // namespace details

/**
* @class BcfHeader
* @brief Object represents header of the VCF, offering methods to access and modify the tags
Expand Down Expand Up @@ -380,7 +413,7 @@ class BcfHeader
* */
void updateSamples(const std::string & samples)
{
auto ss = split_string(samples, ",");
auto ss = details::split_string(samples, ",");
const int nsamples = nSamples();
if(nsamples != (int)ss.size())
throw std::runtime_error("You provide either too few or too many samples");
Expand Down Expand Up @@ -466,7 +499,7 @@ class BcfRecord

private:
BcfHeader * header;
std::shared_ptr<bcf1_t> line = std::shared_ptr<bcf1_t>(bcf_init(), variant_close()); // variant
std::shared_ptr<bcf1_t> line = std::shared_ptr<bcf1_t>(bcf_init(), details::bcf_line_close()); // variant
bcf_hdr_t * hdr_d = NULL; // a dup header by bcf_hdr_dup(header->hdr)
bcf_fmt_t * fmt = NULL;
bcf_info_t * info = NULL;
Expand Down Expand Up @@ -1517,9 +1550,9 @@ class BcfReader
{
private:
std::shared_ptr<htsFile> fp; // hts file
hts_idx_t * hidx = NULL; // hts index file
tbx_t * tidx = NULL; // .tbi .csi index file for vcf files
hts_itr_t * itr = NULL; // tabix iterator
std::shared_ptr<hts_idx_t> hidx; // hts index file
std::shared_ptr<tbx_t> tidx; // .tbi .csi index file for vcf files
std::shared_ptr<hts_itr_t> itr; // hts iterator
kstring_t s = {0, 0, NULL}; // kstring
std::string fname;
bool isBcf; // if the input file is bcf or vcf;
Expand All @@ -1529,7 +1562,7 @@ class BcfReader
BcfHeader header;
/// number of samples in the VCF
int nsamples;
/// number of samples in the VCF
/// a vector of samples name in the VCF
std::vector<std::string> SamplesName;

/// Construct an empty BcfReader
Expand Down Expand Up @@ -1575,24 +1608,25 @@ class BcfReader

~BcfReader()
{
close();
if(s.s) free(s.s);
}

/// close the BcfReader object.
/// Close the VCF file and its associated files
void close()
{
if(fp) fp.reset();
if(itr) hts_itr_destroy(itr);
if(hidx) hts_idx_destroy(hidx);
if(tidx) tbx_destroy(tidx);
if(s.s) free(s.s);
if(hidx) hidx.reset();
if(itr) itr.reset();
if(tidx) tidx.reset();
}

/// open a VCF/BCF/STDIN file for streaming in
/// Open a VCF/BCF/STDIN file for streaming in
void open(const std::string & file)
{
if(!fname.empty() && fname != file)
throw std::runtime_error("does not support re-open a file yet. " + fname);
fname = file;
fp = std::shared_ptr<htsFile>(hts_open(file.c_str(), "r"), htsFile_close());
fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), "r"), details::hts_file_close());
if(!fp) throw std::invalid_argument("I/O error: input file is invalid");
header.hdr = bcf_hdr_read(fp.get());
nsamples = bcf_hdr_nsamples(header.hdr);
Expand All @@ -1611,12 +1645,44 @@ class BcfReader
return header;
}

/** @brief get the number of records of given region */
uint64_t getVariantsCount(BcfRecord & r, const std::string & region)
/**
* @brief query the status of a given region in the VCF
* @return -2: the region is not a valid bcftools-like format,
* or it is not presenting in the VCF even though it's bcftols-like format. \n
* -1: there is no index file found. \n
* 0: the region is valid but empty. \n
* 1: vaild and not empty. \n
*/
int getStatus(const std::string & region)
{
try
{
setRegion(region);
BcfRecord v(header);
if(!getNextVariant(v)) return 0;
}
catch(const std::invalid_argument & e)
{
return -1;
}
catch(const std::runtime_error & e)
{
return -2;
}
return 1;
}

/**
* @brief count the number of variants by iterating through a given region.
* @note If you want to continue work on that region, remember to reset the region by setRegion()! \n
* Also, check the status of the region first to handle the different cases!
*/
int getVariantsCount(const std::string & region)
{
uint64_t c{0};
while(getNextVariant(r)) c++;
setRegion(region); // reset the region
int c{0};
setRegion(region);
BcfRecord v(header);
while(getNextVariant(v)) c++;
return c;
}

Expand All @@ -1633,54 +1699,61 @@ class BcfReader
}

/**
* @brief explicitly stream to specific region
* @param region the string is samtools-like format which is chr:start-end
* @brief explicitly stream to specific region. throw invalid_argument error if index file not found.
* throw runtime_error if the region was not a valid bcftools-like format or was not presenting in the
* VCF.
* @param region the string for region is samtools-like format, which can be 'chr', 'chr:start' and
* 'chr:start-end'
* */
void setRegion(const std::string & region)
{
// 1. check and load index first
// 2. query iterval region
// 3. if region is empty, use "."
if(isEndWith(fname, "bcf") || isEndWith(fname, "bcf.gz"))
if(details::isEndWith(fname, "bcf") || details::isEndWith(fname, "bcf.gz"))
{
isBcf = true;
hidx = bcf_index_load(fname.c_str());
if(itr) bcf_itr_destroy(itr); // reset current region.
hidx = std::shared_ptr<hts_idx_t>(bcf_index_load(fname.c_str()), details::hts_idx_close());
if(itr) itr.reset(); // reset current region.
if(region.empty())
itr = bcf_itr_querys(hidx, header.hdr, ".");
itr = std::shared_ptr<hts_itr_t>(bcf_itr_querys(hidx.get(), header.hdr, "."),
details::hts_iter_close());
else
itr = bcf_itr_querys(hidx, header.hdr, region.c_str());
itr = std::shared_ptr<hts_itr_t>(bcf_itr_querys(hidx.get(), header.hdr, region.c_str()),
details::hts_iter_close());
}
else
{
isBcf = false;
tidx = tbx_index_load(fname.c_str());
if(tidx == NULL) throw std::runtime_error("error in loading tabix index!");
if(itr) tbx_itr_destroy(itr); // reset current region.
tidx = std::shared_ptr<tbx_t>(tbx_index_load(fname.c_str()), details::tabix_idx_close());
if(tidx.get() == NULL) throw std::invalid_argument(" no tabix index found!");
if(itr) itr.reset(); // reset
if(region.empty())
itr = tbx_itr_querys(tidx, ".");
itr = std::shared_ptr<hts_itr_t>(tbx_itr_querys(tidx.get(), "."), details::hts_iter_close());
else
itr = tbx_itr_querys(tidx, region.c_str());
if(itr == NULL) throw std::runtime_error("no interval region found!");
itr = std::shared_ptr<hts_itr_t>(tbx_itr_querys(tidx.get(), region.c_str()),
details::hts_iter_close());
}
if(itr.get() == NULL)
throw std::runtime_error("region was not found! make sure the region format is correct");
}

/** @brief read in the next variant
* @param r r is a BcfRecord object to be filled in. */
* @param r the BcfRecord object to be filled in. */
bool getNextVariant(BcfRecord & r)
{
int ret = -1;
if(itr != NULL)
if(itr.get() != NULL)
{
if(isBcf)
{
ret = bcf_itr_next(fp.get(), itr, r.line.get());
ret = bcf_itr_next(fp.get(), itr.get(), r.line.get());
bcf_unpack(r.line.get(), BCF_UN_ALL);
return (ret >= 0);
}
else
{
int slen = tbx_itr_next(fp.get(), tidx, itr, &s);
int slen = tbx_itr_next(fp.get(), tidx.get(), itr.get(), &s);
if(slen > 0)
{
ret = vcf_parse1(&s, r.header->hdr, r.line.get()); // ret > 0, error
Expand All @@ -1707,7 +1780,7 @@ class BcfWriter
{
private:
std::shared_ptr<htsFile> fp; // hts file
std::shared_ptr<bcf1_t> b = std::shared_ptr<bcf1_t>(bcf_init(), variant_close()); // variant
std::shared_ptr<bcf1_t> b = std::shared_ptr<bcf1_t>(bcf_init(), details::bcf_line_close()); // variant
int ret;
bool isHeaderWritten = false;
const BcfHeader * hp;
Expand Down Expand Up @@ -1783,8 +1856,8 @@ class BcfWriter
*/
void open(const std::string & fname)
{
auto mode = getMode(fname, "w");
fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), mode.c_str()), htsFile_close());
auto mode = details::getMode(fname, "w");
fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), mode.c_str()), details::hts_file_close());
if(!fp) throw std::invalid_argument("I/O error: input file is invalid");
}

Expand All @@ -1799,7 +1872,7 @@ class BcfWriter
*/
void open(const std::string & fname, const std::string & mode)
{
fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), mode.c_str()), htsFile_close());
fp = std::shared_ptr<htsFile>(hts_open(fname.c_str(), mode.c_str()), details::hts_file_close());
if(!fp) throw std::invalid_argument("I/O error: input file is invalid");
}

Expand Down Expand Up @@ -1872,7 +1945,7 @@ class BcfWriter
}

/// streams out the given variant of BcfRecord type
inline bool writeRecord(BcfRecord & v)
bool writeRecord(BcfRecord & v)
{
if(!isHeaderWritten) writeHeader();
if(bcf_write(fp.get(), v.header->hdr, v.line.get()) < 0) return false;
Expand Down
4 changes: 4 additions & 0 deletions man/vcfreader.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 2111f60

Please sign in to comment.