diff --git a/.appveyor.yml b/.appveyor.yml index 45550bade..8fe288094 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -27,7 +27,7 @@ install: - set MSYSTEM=MINGW64 - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH% - set MINGWPREFIX=x86_64-w64-mingw32 - - "sh -lc \"pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 mingw-w64-x86_64-xz mingw-w64-x86_64-curl\"" + - "sh -lc \"pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-autotools mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 mingw-w64-x86_64-xz mingw-w64-x86_64-curl mingw-w64-x86_64-tools-git\"" build_script: - set HOME=. diff --git a/.cirrus.yml b/.cirrus.yml index 2e45de544..83e36af9c 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -117,11 +117,11 @@ ubuntu_task: << : *TEST -# CentOS -centos_task: - name: centos-gcc +# Rocky Linux +rocky_task: + name: rockylinux-gcc container: - image: centos:latest + image: rockylinux:latest cpu: 2 memory: 1G @@ -146,7 +146,7 @@ centos_task: macosx_task: name: macosx + clang osx_instance: - image: catalina-base + image: monterey-base environment: CC: clang diff --git a/INSTALL b/INSTALL index d0eb638c5..2f6deacb2 100644 --- a/INSTALL +++ b/INSTALL @@ -53,11 +53,9 @@ library is used. Systems that do not have CCHmac will get this from libcrypto. libcrypto is part of OpenSSL or one of its derivatives (LibreSSL or BoringSSL). -On Microsoft Windows we recommend use of Mingw64/Msys2. Note that -currently for the test harness to work you will need to override the -test temporary directory with e.g.: make check TEST_OPTS="-t C:/msys64/tmp/_" -Whilst the code may work on Windows with other environments, these have -not been verified. +On Microsoft Windows we recommend use of Mingw64/Msys2. Whilst the +code may work on Windows with other environments, these have not been +verified. Use of the configure script is a requirement too. Update htscodecs submodule ========================== @@ -259,13 +257,37 @@ RedHat / CentOS sudo yum install autoconf automake make gcc perl-Data-Dumper zlib-devel bzip2 bzip2-devel xz-devel curl-devel openssl-devel +Note: On some versions perl FindBin will need to be installed to make the tests work. + +sudo yum install perl-FindBin + Alpine Linux ------------ -sudo apk update # Ensure the package list is up to date -sudo apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev libressl-dev +doas apk update # Ensure the package list is up to date +doas apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev libressl-dev OpenSUSE -------- sudo zypper install autoconf automake make gcc perl zlib-devel libbz2-devel xz-devel libcurl-devel libopenssl-devel + +Windows MSYS2/MINGW64 +--------------------- + +The configure script must be used as without it the compilation will +likely fail. + +Follow MSYS2 installation instructions at +https://www.msys2.org/wiki/MSYS2-installation/ + +Then relaunch to MSYS2 shell using the "MSYS2 MinGW x64" executable. +Once in that environment (check $MSYSTEM equals "MINGW64") install the +compilers using pacman -S and the following package list: + +base-devel mingw-w64-x86_64-toolchain +mingw-w64-x86_64-libdeflate mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 +mingw-w64-x86_64-xz mingw-w64-x86_64-curl mingw-w64-x86_64-autotools +mingw-w64-x86_64-tools-git + +(The last is only needed for building libraries compatible with MSVC.) diff --git a/LICENSE b/LICENSE index 5075c3f53..cf2e9f82a 100644 --- a/LICENSE +++ b/LICENSE @@ -3,7 +3,7 @@ according to the terms of the following MIT/Expat license.] The MIT/Expat License -Copyright (C) 2012-2021 Genome Research Ltd. +Copyright (C) 2012-2022 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -29,7 +29,7 @@ according to the terms of the following Modified 3-Clause BSD license.] The Modified-BSD License -Copyright (C) 2012-2020 Genome Research Ltd. +Copyright (C) 2012-2022 Genome Research Ltd. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/Makefile b/Makefile index 37ae76565..0871580ce 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Makefile for htslib, a C library for high-throughput sequencing data formats. # -# Copyright (C) 2013-2021 Genome Research Ltd. +# Copyright (C) 2013-2022 Genome Research Ltd. # # Author: John Marshall # @@ -131,8 +131,8 @@ LIBHTS_SOVERSION = 3 # is not strictly necessary and should be removed the next time # LIBHTS_SOVERSION is bumped (see #1144 and # https://developer.apple.com/library/archive/documentation/DeveloperTools/Conceptual/DynamicLibraries/100-Articles/DynamicLibraryDesignGuidelines.html#//apple_ref/doc/uid/TP40002013-SW23) -MACH_O_COMPATIBILITY_VERSION = 3.1.14 -MACH_O_CURRENT_VERSION = 3.1.14 +MACH_O_COMPATIBILITY_VERSION = 3.1.15 +MACH_O_CURRENT_VERSION = 3.1.15 # $(NUMERIC_VERSION) is for items that must have a numeric X.Y.Z string # even if this is a dirty or untagged Git working tree. @@ -282,10 +282,10 @@ SHLIB_FLAVOUR = cygdll lib-shared: cyghts-$(LIBHTS_SOVERSION).dll else ifeq "$(findstring MSYS,$(PLATFORM))" "MSYS" SHLIB_FLAVOUR = dll -lib-shared: hts-$(LIBHTS_SOVERSION).dll +lib-shared: hts-$(LIBHTS_SOVERSION).dll hts-$(LIBHTS_SOVERSION).def hts-$(LIBHTS_SOVERSION).lib else ifeq "$(findstring MINGW,$(PLATFORM))" "MINGW" SHLIB_FLAVOUR = dll -lib-shared: hts-$(LIBHTS_SOVERSION).dll +lib-shared: hts-$(LIBHTS_SOVERSION).dll hts-$(LIBHTS_SOVERSION).def hts-$(LIBHTS_SOVERSION).lib else SHLIB_FLAVOUR = so lib-shared: libhts.so @@ -330,6 +330,41 @@ cyghts-$(LIBHTS_SOVERSION).dll libhts.dll.a: $(LIBHTS_OBJS) hts-$(LIBHTS_SOVERSION).dll hts.dll.a: $(LIBHTS_OBJS) $(CC) -shared -Wl,--out-implib=hts.dll.a -Wl,--enable-auto-import -Wl,--exclude-all-symbols $(LDFLAGS) -o $@ -Wl,--whole-archive $(LIBHTS_OBJS) -Wl,--no-whole-archive $(LIBS) -lpthread +hts-$(LIBHTS_SOVERSION).def: hts-$(LIBHTS_SOVERSION).dll + gendef hts-$(LIBHTS_SOVERSION).dll + +hts-$(LIBHTS_SOVERSION).lib: hts-$(LIBHTS_SOVERSION).def + dlltool -m i386:x86-64 -d hts-$(LIBHTS_SOVERSION).def -l hts-$(LIBHTS_SOVERSION).lib + +# Bundling libraries, binaries, dll dependencies, and licenses into a +# single directory. NB: This is not needed for end-users, but a test bed +# for maintainers building binary distributions. +# +# NOTE: only tested on the supported MSYS2/MINGW64 environment. +dist-windows: DESTDIR= +dist-windows: prefix=dist-windows +dist-windows: install + cp hts-$(LIBHTS_SOVERSION).def hts-$(LIBHTS_SOVERSION).lib dist-windows/lib + cp `ldd hts-$(LIBHTS_SOVERSION).dll| awk '/mingw64/ {print $$3}'` dist-windows/bin + mkdir -p dist-windows/share/licenses/htslib + -cp -r /mingw64/share/licenses/mingw-w64-libraries \ + /mingw64/share/licenses/brotli \ + /mingw64/share/licenses/bzip2 \ + /mingw64/share/licenses/gcc-libs \ + /mingw64/share/licenses/libdeflate \ + /mingw64/share/licenses/libpsl \ + /mingw64/share/licenses/libtre \ + /mingw64/share/licenses/libwinpthread \ + /mingw64/share/licenses/openssl \ + /mingw64/share/licenses/xz \ + /mingw64/share/licenses/zlib \ + /mingw64/share/licenses/zstd \ + dist-windows/share/licenses/ + -cp -r /usr/share/licenses/curl \ + dist-windows/share/licenses/ + cp LICENSE dist-windows/share/licenses/htslib/ + + # Target to allow htslib.mk to build all the object files before it # links the shared and static libraries. hts-object-files: $(LIBHTS_OBJS) diff --git a/NEWS b/NEWS index 60646df55..51f65559a 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,86 @@ +Noteworthy changes in release 1.15 (21st February 2022) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Features and Updates +-------------------- + +* Bgzip now has a --keep option to not remove the input file after + compressing. (PR#1331) + +* Improved file format detection so some BED files are no longer + detected as FASTQ or FASTA. (PR#1350, thanks to John Marshall) + +* Added xz (lzma), zstd and D4 formats to the file type detection + functions. We don't actively support reading these data types, but + function calls and htsfile can detect them. (PR#1340, thanks to + John Marshall) + +* CRAM now also uses libdeflate for read-names if the libdeflate + version is new enough (1.9 onwards). Previously we used zlib for + this due to poor performance of libdeflate. This gives a slight + speed up and reduction in file size. (PR#1383) + +* The VCF and BCF readers will now issue a warning if contig, INFO + or FORMAT IDs do not match the formats described in the VCFv4.3 + specification. Note that while the invalid names will mostly still + be accepted, future updates will convert the warnings to errors + causing files including invalid names to be rejected. (PR#1389) + +Build changes +------------- + +These are compiler, configuration and makefile based changes. + +* HTSlib now uses libhtscodecs release 1.2.1. + +* Improved support for compiling and linking against HTSlib with + Microsoft Visual Studio. (PR#1380, #1377, #1375. Thanks to + Aidan Bickford and John Marshall) + +* Various internal CI improvements. + +Bug fixes +--------- + +* Fixed CRAM index queries for HTSJDK output (PR#1388, reported by + Chris Norman). Note this also fixes writing CRAM writing, to match + the specification (and HTSJDK), from version 3.1 onwards. + +* Fixed CRAM index queries when required-fields settings are selected + to ignore CIGARs (PR#1372, reported by Giulio Genovese). + +* Unmapped but placed (having chr/pos) are now included in the BAM + indices. (PR#1352, thanks to John Marshall) + +* CRAM now honours the filename##idx##index nomenclature for + specifying non-standard index locations. (PR#1360, reported by + Michael Cariaso) + +* Minor CRAM v1.0 read-group fix (PR#1349, thanks to John Marshall) + +* Permit .fa and .fq file type detection as synonyms for FASTA and + FASTQ. (PR#1386). + +* Empty VCF format fields are now output ":.:" as instead of "::". + (PR#1370) + +* Repeated bcf_sr_seek calls now work. (PR#1363, reported by + Giulio Genovese) + +* Bcf_remove_allele_set now works on unpacked BCF records. (PR#1358, + reported by Brent Pedersen). + +* The hts_parse_decimal() function used to read numbers in region lists + is now better at rejecting non-numeric values. In particular it + now rejects a lone 'G' instead of interpreting it as '0G', i.e. zero. + (PR#1396, PR#1400, reported by SSSimon Yang; thanks to John Marshall). + +* Improve support for GPU issues listed by -Wdouble-promotion. + (PR#1365, reported by David Seisert) + +* Fix example code in header file documentation. (PR#1381, Thanks to + Aidan Bickford) + Noteworthy changes in release 1.14 (22nd October 2021) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bgzip.1 b/bgzip.1 index 86c144f46..90630345c 100644 --- a/bgzip.1 +++ b/bgzip.1 @@ -1,4 +1,4 @@ -.TH bgzip 1 "22 October 2021" "htslib-1.14" "Bioinformatics tools" +.TH bgzip 1 "21 February 2022" "htslib-1.15" "Bioinformatics tools" .SH NAME .PP bgzip \- Block compression/decompression utility diff --git a/bgzip.c b/bgzip.c index f9abf99c5..bd0374811 100644 --- a/bgzip.c +++ b/bgzip.c @@ -173,7 +173,7 @@ int main(int argc, char **argv) case 1: printf( "bgzip (htslib) %s\n" -"Copyright (C) 2021 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2022 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; case 'h': return bgzip_main_usage(stdout, EXIT_SUCCESS); case '?': return bgzip_main_usage(stderr, EXIT_FAILURE); diff --git a/cram/cram_decode.c b/cram/cram_decode.c index e65e11d08..b352fc633 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020 Genome Research Ltd. +Copyright (c) 2012-2020, 2022 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -1777,7 +1777,7 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } cr->ncigar = ncigar - cr->cigar; - cr->aend = ref_pos; + cr->aend = ref_pos > cr->apos ? ref_pos : cr->apos; //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos); @@ -3228,7 +3228,8 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { } // position beyond end of range; bail out - if (c_next->ref_seq_start > fd->range.end) { + if (fd->range.refid != -1 && + c_next->ref_seq_start > fd->range.end) { cram_free_container(c_next); fd->ctr_mt = NULL; fd->ooc = 1; @@ -3236,7 +3237,8 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { } // before start of range; skip to next container - if (c_next->ref_seq_start + c_next->ref_seq_span-1 < + if (fd->range.refid != -1 && + c_next->ref_seq_start + c_next->ref_seq_span-1 < fd->range.start) { c_next->curr_slice_mt = c_next->max_slice; cram_seek(fd, c_next->length, SEEK_CUR); @@ -3301,7 +3303,8 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { } // position beyond end of range; bail out - if (s_next->hdr->ref_seq_start > fd->range.end) { + if (fd->range.refid != -1 && + s_next->hdr->ref_seq_start > fd->range.end) { fd->ooc = 1; cram_free_slice(s_next); c_next->slice = s_next = NULL; @@ -3309,7 +3312,8 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { } // before start of range; skip to next slice - if (s_next->hdr->ref_seq_start + s_next->hdr->ref_seq_span-1 < + if (fd->range.refid != -1 && + s_next->hdr->ref_seq_start + s_next->hdr->ref_seq_span-1 < fd->range.start) { cram_free_slice(s_next); c_next->slice = s_next = NULL; diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 192278f17..d35643a92 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020 Genome Research Ltd. +Copyright (c) 2012-2020, 2022 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -1397,6 +1397,9 @@ static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s, return -1; } +// CRAM version >= 3.1 +#define CRAM_ge31(v) ((v) >= 0x301) + /* * Encodes all slices in a container into blocks. * Returns 0 on success @@ -1526,6 +1529,12 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { s->hdr->ref_seq_id = -2; s->hdr->ref_seq_start = 0; s->hdr->ref_seq_span = 0; + } else if (c->ref_id == -1 && CRAM_ge31(fd->version)) { + // Spec states span=0, but it broke our range queries. + // See commit message for this and prior. + s->hdr->ref_seq_id = -1; + s->hdr->ref_seq_start = 0; + s->hdr->ref_seq_span = 0; } else { s->hdr->ref_seq_id = c->ref_id; s->hdr->ref_seq_start = first_base; @@ -1923,8 +1932,15 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { } c->ref_seq_id = c->slices[0]->hdr->ref_seq_id; - c->ref_seq_start = c->slices[0]->hdr->ref_seq_start; - c->ref_seq_span = c->slices[0]->hdr->ref_seq_span; + if (c->ref_seq_id == -1 && CRAM_ge31(fd->version)) { + // Spec states span=0, but it broke our range queries. + // See commit message for this and prior. + c->ref_seq_start = 0; + c->ref_seq_span = 0; + } else { + c->ref_seq_start = c->slices[0]->hdr->ref_seq_start; + c->ref_seq_span = c->slices[0]->hdr->ref_seq_span; + } for (i = 0; i < c->curr_slice; i++) { cram_slice *s = c->slices[i]; @@ -2592,12 +2608,18 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, * * See cram_next_container() and cram_close(). */ -void cram_update_curr_slice(cram_container *c) { +void cram_update_curr_slice(cram_container *c, int version) { cram_slice *s = c->slice; if (c->multi_seq) { s->hdr->ref_seq_id = -2; s->hdr->ref_seq_start = 0; s->hdr->ref_seq_span = 0; + } else if (c->curr_ref == -1 && CRAM_ge31(version)) { + // Spec states span=0, but it broke our range queries. + // See commit message for this and prior. + s->hdr->ref_seq_id = -1; + s->hdr->ref_seq_start = 0; + s->hdr->ref_seq_span = 0; } else { s->hdr->ref_seq_id = c->curr_ref; s->hdr->ref_seq_start = c->first_base; @@ -2632,7 +2654,7 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { c->curr_ref = bam_ref(b); if (c->slice) - cram_update_curr_slice(c); + cram_update_curr_slice(c, fd->version); /* Flush container */ if (c->curr_slice == c->max_slice || @@ -3057,7 +3079,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, cr->rg = brg ? brg->id : -1; } else if (CRAM_MAJOR_VERS(fd->version) == 1) { sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, "UNKNOWN"); - assert(brg); + if (!brg) goto block_err; + cr->rg = brg->id; } else { cr->rg = -1; } diff --git a/cram/cram_encode.h b/cram/cram_encode.h index c779b46a7..7cccae9af 100644 --- a/cram/cram_encode.h +++ b/cram/cram_encode.h @@ -106,7 +106,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c); * * See cram_next_container() and cram_close(). */ -void cram_update_curr_slice(cram_container *c); +void cram_update_curr_slice(cram_container *c, int version); #ifdef __cplusplus } diff --git a/cram/cram_index.c b/cram/cram_index.c index b567d3e05..45d420df2 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -173,6 +173,11 @@ int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) { idx_stack[idx_stack_ptr] = idx; + // Support pathX.cram##idx##pathY.crai + const char *fn_delim = strstr(fn, HTS_IDX_DELIM); + if (fn_delim && !fn_idx) + fn_idx = fn_delim + strlen(HTS_IDX_DELIM); + if (!fn_idx) { if (hts_idx_check_local(fn, HTS_FMT_CRAI, &tfn_idx) == 0 && hisremote(fn)) tfn_idx = hts_idx_getfn(fn, ".crai"); diff --git a/cram/cram_io.c b/cram/cram_io.c index bc4f03646..c9dcb5014 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2021 Genome Research Ltd. +Copyright (c) 2012-2022 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -1108,8 +1108,8 @@ char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) { static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size, int level, int strat) { level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default - level *= 1.2; // NB levels go up to 12 here; 5 onwards is +1 - if (level >= 8) level += level/8; // 8->10, 9->12 + level *= 1.23; // NB levels go up to 12 here; 5 onwards is +1 + level += level>=8; // 5,6,7->6,7,8 8->10 9->12 if (level > 12) level = 12; if (strat == Z_RLE) // not supported by libdeflate @@ -1213,6 +1213,7 @@ char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) { } #endif +#if !defined(HAVE_LIBDEFLATE) || LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8) static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size, int level, int strat) { z_stream s; @@ -1269,6 +1270,7 @@ static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size, } return (char *)cdata; } +#endif #ifdef HAVE_LIBLZMA /* ------------------------------------------------------------------------ */ @@ -1754,9 +1756,11 @@ static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size, // // Eg RN at level 5; libdeflate=55.9MB zlib=51.6MB #ifdef HAVE_LIBDEFLATE +# if (LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8)) if (content_id == DS_RN && level >= 4 && level <= 7) return zlib_mem_deflate(in, in_size, out_size, level, strat); else +# endif return libdeflate_deflate(in, in_size, out_size, level, strat); #else return zlib_mem_deflate(in, in_size, out_size, level, strat); @@ -5338,7 +5342,7 @@ int cram_flush(cram_fd *fd) { if (fd->mode == 'w' && fd->ctr) { if(fd->ctr->slice) - cram_update_curr_slice(fd->ctr); + cram_update_curr_slice(fd->ctr, fd->version); if (-1 == cram_flush_container_mt(fd, fd->ctr)) return -1; @@ -5445,7 +5449,7 @@ int cram_close(cram_fd *fd) { if (fd->mode == 'w' && fd->ctr) { if(fd->ctr->slice) - cram_update_curr_slice(fd->ctr); + cram_update_curr_slice(fd->ctr, fd->version); if (-1 == cram_flush_container_mt(fd, fd->ctr)) return -1; diff --git a/hts.c b/hts.c index 194fb1f7a..ddaa60bfb 100644 --- a/hts.c +++ b/hts.c @@ -505,7 +505,12 @@ static int colmatch(const char *columns, const char *pattern) int hts_detect_format(hFILE *hfile, htsFormat *fmt) { - char columns[24]; + return hts_detect_format2(hfile, NULL, fmt); +} + +int hts_detect_format2(hFILE *hfile, const char *fname, htsFormat *fmt) +{ + char extension[HTS_MAX_EXT_LEN], columns[24]; unsigned char s[1024]; int complete = 0; ssize_t len = hpeek(hfile, s, 18); @@ -568,6 +573,18 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) return 0; } + // We avoid using filename extensions wherever possible (as filenames are + // not always available), but in a few cases they must be considered: + // - FASTA/Q indexes are simply tab-separated text; files that match these + // patterns but not the fai/fqi extension are usually generic BED files + // - GZI indexes have no magic numbers so can only be detected by filename + if (fname && strcmp(fname, "-") != 0) { + char *s; + if (find_file_extension(fname, extension) < 0) extension[0] = '\0'; + for (s = extension; *s; s++) *s = tolower_c(*s); + } + else extension[0] = '\0'; + if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=7 && s[5]<=7) { fmt->category = sequence_data; fmt->format = cram; @@ -613,6 +630,13 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) fmt->format = tbi; return 0; } + // GZI indexes have no magic numbers, so must be recognised solely by + // filename extension. + else if (strcmp(extension, "gzi") == 0) { + fmt->category = index_file; + fmt->format = gzi; + return 0; + } } else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) { fmt->category = variant_data; @@ -679,12 +703,12 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) fmt->format = crai; return 0; } - else if (colmatch(columns, "Ziiiii") == 6) { + else if (strstr(extension, "fqi") && colmatch(columns, "Ziiiii") == 6) { fmt->category = index_file; fmt->format = fqi_format; return 0; } - else if (colmatch(columns, "Ziiii") == 5) { + else if (strstr(extension, "fai") && colmatch(columns, "Ziiii") == 5) { fmt->category = index_file; fmt->format = fai_format; return 0; @@ -1367,7 +1391,7 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) if (strchr(simple_mode, 'r')) { const int max_loops = 5; // Should be plenty int loops = 0; - if (hts_detect_format(hfile, &fp->format) < 0) goto error; + if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error; // Deal with formats that re-direct an underlying file via a plug-in. // Loops as we may have crypt4gh served via htsget, or @@ -1392,7 +1416,7 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) } // Re-detect format against the result of the redirection - if (hts_detect_format(hfile, &fp->format) < 0) goto error; + if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error; } } else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) { @@ -2014,7 +2038,7 @@ int hts_file_type(const char *fname) if (f == NULL) return 0; htsFormat fmt; - if (hts_detect_format(f, &fmt) < 0) { hclose_abruptly(f); return 0; } + if (hts_detect_format2(f, fname, &fmt) < 0) { hclose_abruptly(f); return 0; } if (hclose(f) < 0) return 0; switch (fmt.format) { @@ -2384,14 +2408,12 @@ int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t if ( tid>=0 ) { if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin); - if (is_mapped) { - // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin - if (beg < 0) beg = 0; - if (end <= 0) end = 1; - // idx->z.last_off points to the start of the current record - if (insert_to_l(&idx->lidx[tid], beg, end, - idx->z.last_off, idx->min_shift) < 0) return -1; - } + // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin + if (beg < 0) beg = 0; + if (end <= 0) end = 1; + // idx->z.last_off points to the start of the current record + if (insert_to_l(&idx->lidx[tid], beg, end, + idx->z.last_off, idx->min_shift) < 0) return -1; } else idx->n_no_coor++; bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls); @@ -3460,32 +3482,32 @@ static inline long long push_digit(long long i, char c) long long hts_parse_decimal(const char *str, char **strend, int flags) { long long n = 0; - int decimals = 0, e = 0, lost = 0; + int digits = 0, decimals = 0, e = 0, lost = 0; char sign = '+', esign = '+'; - const char *s; + const char *s, *str_orig = str; while (isspace_c(*str)) str++; s = str; if (*s == '+' || *s == '-') sign = *s++; while (*s) - if (isdigit_c(*s)) n = push_digit(n, *s++); + if (isdigit_c(*s)) digits++, n = push_digit(n, *s++); else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++; else break; if (*s == '.') { s++; - while (isdigit_c(*s)) decimals++, n = push_digit(n, *s++); + while (isdigit_c(*s)) decimals++, digits++, n = push_digit(n, *s++); } - if (*s == 'E' || *s == 'e') { + switch (*s) { + case 'e': case 'E': s++; if (*s == '+' || *s == '-') esign = *s++; while (isdigit_c(*s)) e = push_digit(e, *s++); if (esign == '-') e = -e; - } + break; - switch (*s) { case 'k': case 'K': e += 3; s++; break; case 'm': case 'M': e += 6; s++; break; case 'g': case 'G': e += 9; s++; break; @@ -3500,7 +3522,10 @@ long long hts_parse_decimal(const char *str, char **strend, int flags) } if (strend) { - *strend = (char *)s; + // Set to the original input str pointer if not valid number syntax + *strend = (digits > 0)? (char *)s : (char *)str_orig; + } else if (digits == 0) { + hts_log_warning("Invalid numeric value %.8s[truncated]", str); } else if (*s) { if ((flags & HTS_PARSE_THOUSANDS_SEP) || (!(flags & HTS_PARSE_THOUSANDS_SEP) && *s != ',')) hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s); @@ -4220,7 +4245,7 @@ static int idx_test_and_fetch(const char *fn, const char **local_fn, int *local_ free(s.s); return -1; } - if (hts_detect_format(remote_hfp, &fmt)) { + if (hts_detect_format2(remote_hfp, fn, &fmt)) { hts_log_error("Failed to detect format of index file '%s'", fn); goto fail; } diff --git a/hts_internal.h b/hts_internal.h index b4aa4c0d9..61956da21 100644 --- a/hts_internal.h +++ b/hts_internal.h @@ -139,7 +139,8 @@ static inline int find_file_extension(const char *fn, char ext_out[static HTS_MA { for (ext--; ext > fn && *ext != '.' && *ext != '/'; --ext) {} } - if (*ext != '.' || delim - ext > HTS_MAX_EXT_LEN || delim - ext < 4) return -1; + if (*ext != '.' || delim - ext > HTS_MAX_EXT_LEN || delim - ext < 3) + return -1; memcpy(ext_out, ext + 1, delim - ext - 1); ext_out[delim - ext - 1] = '\0'; return 0; diff --git a/htscodecs b/htscodecs index ed325d7e4..c6a459a44 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit ed325d7e406926b2be7697bdb185cf1692879c2a +Subproject commit c6a459a4488624d5e4b2d4d642febbd55a78a9b1 diff --git a/htsfile.1 b/htsfile.1 index 6160905ab..055ab9af6 100644 --- a/htsfile.1 +++ b/htsfile.1 @@ -1,4 +1,4 @@ -.TH htsfile 1 "22 October 2021" "htslib-1.14" "Bioinformatics tools" +.TH htsfile 1 "21 February 2022" "htslib-1.15" "Bioinformatics tools" .SH NAME htsfile \- identify high-throughput sequencing data files .\" diff --git a/htsfile.c b/htsfile.c index d6d6b4e69..0391a98fa 100644 --- a/htsfile.c +++ b/htsfile.c @@ -258,7 +258,7 @@ int main(int argc, char **argv) case 1: printf( "htsfile (htslib) %s\n" -"Copyright (C) 2021 Genome Research Ltd.\n", +"Copyright (C) 2022 Genome Research Ltd.\n", hts_version()); exit(EXIT_SUCCESS); break; @@ -283,7 +283,7 @@ int main(int argc, char **argv) if (mode == identify) { htsFormat fmt; - if (hts_detect_format(fp, &fmt) < 0) { + if (hts_detect_format2(fp, argv[i], &fmt) < 0) { error("detecting \"%s\" format failed", argv[i]); hclose_abruptly(fp); continue; diff --git a/htslib-s3-plugin.7 b/htslib-s3-plugin.7 index 635a6a3d5..5860ff0f5 100644 --- a/htslib-s3-plugin.7 +++ b/htslib-s3-plugin.7 @@ -1,4 +1,4 @@ -.TH htslib-s3-plugin 7 "22 October 2021" "htslib-1.14" "Bioinformatics tools" +.TH htslib-s3-plugin 7 "21 February 2022" "htslib-1.15" "Bioinformatics tools" .SH NAME s3 plugin \- htslib AWS S3 plugin .\" diff --git a/htslib/bgzf.h b/htslib/bgzf.h index 8e6b9b17e..24d787bdf 100644 --- a/htslib/bgzf.h +++ b/htslib/bgzf.h @@ -35,6 +35,13 @@ #include "hts_defs.h" +// Ensure ssize_t exists within this header. All #includes must precede this, +// and ssize_t must be undefined again at the end of this header. +#if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined ssize_t +#define HTSLIB_SSIZE_T +#define ssize_t intptr_t +#endif + #ifdef __cplusplus extern "C" { #endif @@ -450,4 +457,9 @@ typedef struct BGZF BGZF; } #endif +#ifdef HTSLIB_SSIZE_T +#undef HTSLIB_SSIZE_T +#undef ssize_t +#endif + #endif diff --git a/htslib/hfile.h b/htslib/hfile.h index 987acb7c8..038591cbc 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -32,6 +32,13 @@ DEALINGS IN THE SOFTWARE. */ #include "hts_defs.h" +// Ensure ssize_t exists within this header. All #includes must precede this, +// and ssize_t must be undefined again at the end of this header. +#if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined ssize_t +#define HTSLIB_SSIZE_T +#define ssize_t intptr_t +#endif + #ifdef __cplusplus extern "C" { #endif @@ -368,4 +375,9 @@ int hfile_has_plugin(const char *name); } #endif +#ifdef HTSLIB_SSIZE_T +#undef HTSLIB_SSIZE_T +#undef ssize_t +#endif + #endif diff --git a/htslib/hts.h b/htslib/hts.h index 99d0ff9d4..357593bd2 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -486,7 +486,7 @@ const char *hts_version(void); // Immediately after release, bump ZZ to 90 to distinguish in-development // Git repository builds from the release; you may wish to increment this // further when significant features are merged. -#define HTS_VERSION 101400 +#define HTS_VERSION 101500 /*! @abstract Introspection on the features enabled in htslib * @@ -534,10 +534,29 @@ const char *hts_feature_string(void); @param fp File opened for reading, positioned at the beginning @param fmt Format structure that will be filled out on return @return 0 for success, or negative if an error occurred. + + Equivalent to hts_detect_format2(fp, NULL, fmt). */ HTSLIB_EXPORT int hts_detect_format(struct hFILE *fp, htsFormat *fmt); +/*! + @abstract Determine format primarily by peeking at the start of a file + @param fp File opened for reading, positioned at the beginning + @param fname Name of the file, or NULL if not available + @param fmt Format structure that will be filled out on return + @return 0 for success, or negative if an error occurred. + @since 1.15 + +Some formats are only recognised if the filename is available and has the +expected extension, as otherwise more generic files may be misrecognised. +In particular: + - FASTA/Q indexes must have .fai/.fqi extensions; without this requirement, + some similar BED files would be misrecognised as indexes. +*/ +HTSLIB_EXPORT +int hts_detect_format2(struct hFILE *fp, const char *fname, htsFormat *fmt); + /*! @abstract Get a human-readable description of the file format @param fmt Format structure holding type, version, compression, etc. @@ -1118,10 +1137,26 @@ int hts_idx_nseq(const hts_idx_t *idx); @param strend If non-NULL, set on return to point to the first character in @a str after those forming the parsed number @param flags Or'ed-together combination of HTS_PARSE_* flags - @return Converted value of the parsed number. - - When @a strend is NULL, a warning will be printed (if hts_verbose is HTS_LOG_WARNING - or more) if there are any trailing characters after the number. + @return Integer value of the parsed number, or 0 if no valid number + + The input string is parsed as: optional whitespace; an optional '+' or + '-' sign; decimal digits possibly including ',' characters (if @a flags + includes HTS_PARSE_THOUSANDS_SEP) and a '.' decimal point; and an optional + case-insensitive suffix, which may be either 'k', 'M', 'G', or scientific + notation consisting of 'e'/'E' followed by an optional '+' or '-' sign and + decimal digits. To be considered a valid numeric value, the main part (not + including any suffix or scientific notation) must contain at least one + digit (either before or after the decimal point). + + When @a strend is NULL, @a str is expected to contain only (optional + whitespace followed by) the numeric value. A warning will be printed + (if hts_verbose is HTS_LOG_WARNING or more) if no valid parsable number + is found or if there are any unused characters after the number. + + When @a strend is non-NULL, @a str starts with (optional whitespace + followed by) the numeric value. On return, @a strend is set to point + to the first unused character after the numeric value, or to @a str + if no valid parsable number is found. */ HTSLIB_EXPORT long long hts_parse_decimal(const char *str, char **strend, int flags); diff --git a/htslib/hts_os.h b/htslib/hts_os.h index b71cb89e7..c715b0612 100644 --- a/htslib/hts_os.h +++ b/htslib/hts_os.h @@ -77,4 +77,10 @@ extern int is_cygpty(int fd); #define random rand #endif +/* MSVC does not provide ssize_t in its . This ensures the type + is available (unless suppressed by defining HTS_NO_SSIZE_T first). */ +#if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined HTS_NO_SSIZE_T && !defined ssize_t +#define ssize_t intptr_t +#endif + #endif // HTSLIB_HTS_OS_H diff --git a/htslib/knetfile.h b/htslib/knetfile.h index da9cdc5e8..cfddd6b67 100644 --- a/htslib/knetfile.h +++ b/htslib/knetfile.h @@ -44,6 +44,13 @@ #define netclose(fd) closesocket(fd) #endif +// Ensure ssize_t exists within this header. All #includes must precede this, +// and ssize_t must be undefined again at the end of this header. +#if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined ssize_t +#define HTSLIB_SSIZE_T +#define ssize_t intptr_t +#endif + // FIXME: currently I/O is unbuffered #define KNF_TYPE_LOCAL 1 @@ -102,4 +109,9 @@ extern "C" { } #endif +#ifdef HTSLIB_SSIZE_T +#undef HTSLIB_SSIZE_T +#undef ssize_t +#endif + #endif diff --git a/htslib/kstring.h b/htslib/kstring.h index 150757ca6..09bc9e3d9 100644 --- a/htslib/kstring.h +++ b/htslib/kstring.h @@ -55,6 +55,13 @@ #endif #endif +// Ensure ssize_t exists within this header. All #includes must precede this, +// and ssize_t must be undefined again at the end of this header. +#if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined ssize_t +#define HTSLIB_SSIZE_T +#define ssize_t intptr_t +#endif + /* kstring_t is a simple non-opaque type whose fields are likely to be * used directly by user code (but see also ks_str() and ks_len() below). * A kstring_t object is initialised by either of @@ -396,4 +403,9 @@ static inline int *ksplit(kstring_t *s, int delimiter, int *n) return offsets; } +#ifdef HTSLIB_SSIZE_T +#undef HTSLIB_SSIZE_T +#undef ssize_t +#endif + #endif diff --git a/htslib/sam.h b/htslib/sam.h index d37877e48..ea585a45a 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -33,6 +33,13 @@ DEALINGS IN THE SOFTWARE. */ #include "hts.h" #include "hts_endian.h" +// Ensure ssize_t exists within this header. All #includes must precede this, +// and ssize_t must be undefined again at the end of this header. +#if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined ssize_t +#define HTSLIB_SSIZE_T +#define ssize_t intptr_t +#endif + #ifdef __cplusplus extern "C" { #endif @@ -937,10 +944,12 @@ void bam_destroy1(bam1_t *b); // ... use data ... cleanup: - for (size_t i = 0; i < nrecs; i++) - bam_destroy1(i); + if (recs) { + for (size_t i = 0; i < nrecs; i++) + bam_destroy1(&recs[i]); + free(recs); + } free(buffer); - free(recs); \endcode */ @@ -1492,7 +1501,8 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, ++s; } else if (type == 'f') { if (end - s >= 4) { - ksprintf(ks, "f:%g", le_to_float(s)); + // cast to avoid triggering -Wdouble-promotion + ksprintf(ks, "f:%g", (double)le_to_float(s)); s += 4; } else goto bad_aux; @@ -1594,7 +1604,8 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, if (ks_expand(ks, n*8) < 0) goto mem_err; for (i = 0; i < n; ++i) { ks->s[ks->l++] = ','; - r |= kputd(le_to_float(s), ks) < 0; + // cast to avoid triggering -Wdouble-promotion + r |= kputd((double)le_to_float(s), ks) < 0; s += 4; } break; @@ -2264,4 +2275,9 @@ int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, } #endif +#ifdef HTSLIB_SSIZE_T +#undef HTSLIB_SSIZE_T +#undef ssize_t +#endif + #endif diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index dae70099d..c980e3b45 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -75,6 +75,7 @@ static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, static bcf_sr_regions_t *_regions_init_string(const char *str); static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec); static void _regions_sort_and_merge(bcf_sr_regions_t *reg); +static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end, int missed_reg_handler); char *bcf_sr_strerror(int errnum) { @@ -838,6 +839,7 @@ static void bcf_sr_seek_start(bcf_srs_t *readers) for (i=0; inseqs; i++) reg->regs[i].creg = -1; reg->iseq = 0; + reg->prev_seq = -1; } @@ -851,8 +853,18 @@ int bcf_sr_seek(bcf_srs_t *readers, const char *seq, hts_pos_t pos) bcf_sr_seek_start(readers); return 0; } - bcf_sr_regions_overlap(readers->regions, seq, pos, pos); + int i, nret = 0; + + // Need to position both the readers and the regions. The latter is a bit of a mess + // because we can have in memory or external regions. The safe way is: + // - reset all regions as if they were not read from at all (bcf_sr_seek_start) + // - find the requested iseq (stored in the seq_hash) + // - position regions to the requested position (bcf_sr_regions_overlap) + bcf_sr_seek_start(readers); + if ( khash_str2int_get(readers->regions->seq_hash, seq, &i)>=0 ) readers->regions->iseq = i; + _bcf_sr_regions_overlap(readers->regions, seq, pos, pos, 0); + for (i=0; inreaders; i++) { nret += _reader_seek(&readers->readers[i],seq,pos,MAX_CSI_COOR-1); @@ -1095,7 +1107,7 @@ static int _regions_parse_line(char *line, int ichr, int ifrom, int ito, char ** if ( k==l ) { *from = *to = hts_parse_decimal(ss, &tmp, 0); - if ( tmp==ss ) return -1; + if ( tmp==ss || (*tmp && *tmp!='\t') ) return -1; } else { @@ -1103,7 +1115,7 @@ static int _regions_parse_line(char *line, int ichr, int ifrom, int ito, char ** *from = hts_parse_decimal(ss, &tmp, 0); else *to = hts_parse_decimal(ss, &tmp, 0); - if ( ss==tmp ) return -1; + if ( ss==tmp || (*tmp && *tmp!='\t') ) return -1; for (i=k; ifile); reg->file = NULL; free(reg); return NULL; } + ito = ifrom; } + else if ( ito<0 ) + ito = abs(ito); if ( !ret ) continue; if ( is_bed ) from++; *chr_end = 0; @@ -1406,14 +1421,20 @@ static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *re } int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end) +{ + return _bcf_sr_regions_overlap(reg,seq,start,end,1); +} + +static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end, int missed_reg_handler) { int iseq; if ( khash_str2int_get(reg->seq_hash, seq, &iseq)<0 ) return -1; // no such sequence + if ( missed_reg_handler && !reg->missed_reg_handler ) missed_reg_handler = 0; if ( reg->prev_seq==-1 || iseq!=reg->prev_seq || reg->prev_start > start ) // new chromosome or after a seek { // flush regions left on previous chromosome - if ( reg->missed_reg_handler && reg->prev_seq!=-1 && reg->iseq!=-1 ) + if ( missed_reg_handler && reg->prev_seq!=-1 && reg->iseq!=-1 ) bcf_sr_regions_flush(reg); bcf_sr_regions_seek(reg, seq); @@ -1427,7 +1448,7 @@ int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t sta { if ( bcf_sr_regions_next(reg) < 0 ) return -2; // no more regions left if ( reg->iseq != iseq ) return -1; // does not overlap any regions - if ( reg->missed_reg_handler && reg->end < start ) reg->missed_reg_handler(reg, reg->missed_reg_data); + if ( missed_reg_handler && reg->end < start ) reg->missed_reg_handler(reg, reg->missed_reg_data); } if ( reg->start <= end ) return 0; // region overlap return -1; // no overlap diff --git a/tabix.1 b/tabix.1 index 4b023bd9a..e4bf4b7cb 100644 --- a/tabix.1 +++ b/tabix.1 @@ -1,4 +1,4 @@ -.TH tabix 1 "22 October 2021" "htslib-1.14" "Bioinformatics tools" +.TH tabix 1 "21 February 2022" "htslib-1.15" "Bioinformatics tools" .SH NAME .PP tabix \- Generic indexer for TAB-delimited genome position files diff --git a/tabix.c b/tabix.c index 969f902f6..a79a7b968 100644 --- a/tabix.c +++ b/tabix.c @@ -581,7 +581,7 @@ int main(int argc, char *argv[]) case 1: printf( "tabix (htslib) %s\n" -"Copyright (C) 2021 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2022 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; case 2: return usage(stdout, EXIT_SUCCESS); diff --git a/test/index2.sam b/test/index2.sam new file mode 100644 index 000000000..97d39e65b --- /dev/null +++ b/test/index2.sam @@ -0,0 +1,11 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:1 LN:249250621 M5:1b22b98cdeb4a9304cb5d48026a85128 +@SQ SN:2 LN:243199373 M5:a0d9851da00400dec1098a9255ac712e +um1 69 1 1000000 0 * * 0 0 AAAAAAAAAA * +um1 137 1 1000000 44 10M * 0 0 AAAAAAAAAA * +um2 69 1 2000000 0 * * 0 0 AAAAAAAAAA * +um2 137 1 2000000 44 10M * 0 0 AAAAAAAAAA * +mu1 137 2 1000000 44 10M * 0 0 AAAAAAAAAA * +mu1 69 2 1000000 0 * * 0 0 AAAAAAAAAA * +mu2 137 2 2000000 44 10M * 0 0 AAAAAAAAAA * +mu2 69 2 2000000 0 * * 0 0 AAAAAAAAAA * diff --git a/test/sam.c b/test/sam.c index b6f6c0e04..cc5bfe77a 100644 --- a/test/sam.c +++ b/test/sam.c @@ -1655,6 +1655,37 @@ static int read_data_block(const char *in_name, samFile *fp_in, return ret; } +static void test_parse_decimal1(long long exp, const char *str, size_t exp_consumed, int flags, const char *warning) +{ + if (warning) fprintf(stderr, "(Expect %s message for \"%s\")\n", warning, str); + + long long val = hts_parse_decimal(str, NULL, flags); + if (val != exp) fail("hts_parse_decimal(\"%s\", NULL, %d) returned %lld, expected %lld", str, flags, val, exp); + + char *end; + val = hts_parse_decimal(str, &end, flags); + if (val != exp) fail("hts_parse_decimal(\"%s\", ..., %d) returned %lld, expected %lld", str, flags, val, exp); + size_t consumed = end - str; + if (consumed != exp_consumed) fail("hts_parse_decimal(\"%s\", ..., %d) consumed %zu chars, expected %zu", str, flags, consumed, exp_consumed); +} + +static void test_parse_decimal(void) +{ + test_parse_decimal1(37, "+37", 3, 0, NULL); + test_parse_decimal1(-1001, " \t -1,001x", 9, HTS_PARSE_THOUSANDS_SEP, "trailing 'x'"); + test_parse_decimal1(LLONG_MAX, "+9223372036854775807", 20, 0, NULL); + test_parse_decimal1(LLONG_MIN, "-9,223,372,036,854,775,808", 26, HTS_PARSE_THOUSANDS_SEP, NULL); + test_parse_decimal1(1500, "1.5e3", 5, 0, NULL); + test_parse_decimal1(1500, "1.5e+3k", 6, 0, "trailing 'k'"); + test_parse_decimal1(1500000000, "1.5G", 4, 0, NULL); + test_parse_decimal1(12345, "12.345k", 7, 0, NULL); + test_parse_decimal1(12345, "12.3456k", 8, 0, "dropped fraction"); + test_parse_decimal1(0, "A", 0, 0, "invalid numeric"); + test_parse_decimal1(0, "G", 0, 0, "invalid numeric"); + test_parse_decimal1(0, " +/-", 0, 0, "invalid numeric"); + test_parse_decimal1(0, " \t -.e+9999", 0, 0, "invalid numeric"); +} + static void test_mempolicy(void) { size_t bufsz = MAX_RECS * REC_LENGTH, nrecs = 0, i; @@ -2194,6 +2225,7 @@ int main(int argc, char **argv) check_cigar_tab(); check_big_ref(0); check_big_ref(1); + test_parse_decimal(); test_mempolicy(); set_qname(); for (i = 1; i < argc; i++) faidx1(argv[i]); diff --git a/test/test.pl b/test/test.pl index 5ba6022b4..ff5601b0d 100755 --- a/test/test.pl +++ b/test/test.pl @@ -840,6 +840,31 @@ sub test_index $wtmp =~ s/\//\\\\/g; } test_cmd($opts,out=>'tabix.out',cmd=>"$$opts{bin}/tabix $wtmp/index.vcf.gz##idx##$wtmp/index.vcf.gz.tbi 1:10000060-10000060"); + + cmd("$$opts{path}/test_view -b -p $$opts{tmp}/index2.bam -x $$opts{tmp}/index2.bam.bai $$opts{path}/index2.sam"); + for (my $tid = 1; $tid <= 2; $tid++) { + for (my $pos = 1; $pos <= 2; $pos++) { + # All queries should return exactly two sequences. + # The input data consists of mapped/unmapped and unmapped/mapped + # in both orders. + # Done verbatim as test_cmd cannot return $out for us to check. + my $test = "$$opts{path}/test_view $$opts{tmp}/index2.bam $tid:${pos}000000-${pos}000000"; + print "test_index:\n\t$test\n"; + my ($ret, $out) = _cmd($test); + if ($ret ne 0) { + failed($opts, $test); + } else { + my $rnum = ($out =~ s/^[^@].*\n//gm); + if ($rnum ne 2) { + failed($opts, $test); + } else { + passed($opts, $test); + } + } + } + } + unlink("$$opts{tmp}/index2.bam"); + unlink("$$opts{tmp}/index2.bam.bai"); } sub test_bcf2vcf diff --git a/vcf.c b/vcf.c index 0d59be6e6..18c97662a 100644 --- a/vcf.c +++ b/vcf.c @@ -1,7 +1,7 @@ /* vcf.c -- VCF/BCF API functions. Copyright (C) 2012, 2013 Broad Institute. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. Portions copyright (C) 2014 Intel Corporation. Author: Heng Li @@ -376,6 +376,126 @@ int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key) return -1; } +static void bcf_hrec_set_type(bcf_hrec_t *hrec) +{ + if ( !strcmp(hrec->key, "contig") ) hrec->type = BCF_HL_CTG; + else if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO; + else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT; + else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT; + else if ( hrec->nkeys>0 ) hrec->type = BCF_HL_STR; + else hrec->type = BCF_HL_GEN; +} + + +/** + The arrays were generated with + + valid_ctg: + perl -le '@v = (split(//,q[!#$%&*+./:;=?@^_|~-]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48 + + valid_tag: + perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48 +*/ +static const uint8_t valid_ctg[256] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +static const uint8_t valid_tag[256] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/** + bcf_hrec_check() - check the validity of structured header lines + + Returns 0 on success or negative value on error. + + Currently the return status is not checked by the caller + and only a warning is printed on stderr. This should be improved + to propagate the error all the way up to the caller and let it + decide what to do: throw an error or proceed anyway. + */ +static int bcf_hrec_check(bcf_hrec_t *hrec) +{ + int i; + bcf_hrec_set_type(hrec); + + if ( hrec->type==BCF_HL_CTG ) + { + i = bcf_hrec_find_key(hrec,"ID"); + if ( i<0 ) goto err_missing_id; + char *val = hrec->vals[i]; + if ( val[0]=='*' || val[0]=='=' || !valid_ctg[(uint8_t)val[0]] ) goto err_invalid_ctg; + while ( *(++val) ) + if ( !valid_ctg[(uint8_t)*val] ) goto err_invalid_ctg; + return 0; + } + if ( hrec->type==BCF_HL_INFO ) + { + i = bcf_hrec_find_key(hrec,"ID"); + if ( i<0 ) goto err_missing_id; + char *val = hrec->vals[i]; + if ( !strcmp(val,"1000G") ) return 0; + if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag; + while ( *(++val) ) + if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag; + return 0; + } + if ( hrec->type==BCF_HL_FMT ) + { + i = bcf_hrec_find_key(hrec,"ID"); + if ( i<0 ) goto err_missing_id; + char *val = hrec->vals[i]; + if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag; + while ( *(++val) ) + if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag; + return 0; + } + return 0; + + err_missing_id: + hts_log_warning("Missing ID attribute in one or more header lines"); + return -1; + + err_invalid_ctg: + hts_log_warning("Invalid contig name: \"%s\"", hrec->vals[i]); + return -1; + + err_invalid_tag: + hts_log_warning("Invalid tag name: \"%s\"", hrec->vals[i]); + return -1; +} + static inline int is_escaped(const char *min, const char *str) { int n = 0; @@ -402,6 +522,7 @@ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) if (!hrec->key) goto fail; memcpy(hrec->key,p,n); hrec->key[n] = 0; + hrec->type = -1; p = ++q; if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579 @@ -483,8 +604,14 @@ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0) goto fail; if ( quoted && *q==ending ) q++; - if ( *q=='>' ) { nopen--; q++; } + if ( *q=='>' ) + { + if (nopen) nopen--; // this can happen with nested angle brackets <> + q++; + } } + if ( nopen ) + hts_log_warning("Incomplete header line, trying to proceed anyway:\n\t[%s]\n\t[%d]",line,q[0]); // Skip to end of line int nonspace = 0; @@ -555,10 +682,11 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) khint_t k; char *str = NULL; - if ( !strcmp(hrec->key, "contig") ) + bcf_hrec_set_type(hrec); + + if ( hrec->type==BCF_HL_CTG ) { hts_pos_t len = 0; - hrec->type = BCF_HL_CTG; // Get the contig ID ($str) and length ($j) i = bcf_hrec_find_key(hrec,"length"); @@ -623,11 +751,8 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) return 1; } - if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO; - else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT; - else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT; - else if ( hrec->nkeys>0 ) { hrec->type = BCF_HL_STR; return 1; } - else return 0; + if ( hrec->type==BCF_HL_STR ) return 1; + if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_FMT ) return 0; // INFO/FILTER/FORMAT char *id = NULL; @@ -738,7 +863,8 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) int res; if ( !hrec ) return 0; - hrec->type = BCF_HL_GEN; + bcf_hrec_check(hrec); // todo: check return status and propagate errors up + res = bcf_hdr_register_hrec(hdr,hrec); if (res < 0) return -1; if ( !res ) diff --git a/vcfutils.c b/vcfutils.c index 4153c5c60..890c50a16 100644 --- a/vcfutils.c +++ b/vcfutils.c @@ -1,6 +1,6 @@ /* vcfutils.c -- allele-related utility functions. - Copyright (C) 2012-2018, 2020-2021 Genome Research Ltd. + Copyright (C) 2012-2018, 2020-2022 Genome Research Ltd. Author: Petr Danecek @@ -256,10 +256,10 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb int *map = (int*) calloc(line->n_allele, sizeof(int)); uint8_t *dat = NULL; + bcf_unpack(line, BCF_UN_ALL); + // create map of indexes from old to new ALT numbering and modify ALT kstring_t str = {0,0,0}; - if (!line->d.allele) - bcf_unpack(line, BCF_UN_STR); kputs(line->d.allele[0], &str); int nrm = 0, i,j; // i: ori alleles, j: new alleles @@ -747,7 +747,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nnew = nR_new; } - #define BRANCH(type_t,is_vector_end) \ + #define BRANCH(type_t,is_vector_end,set_missing) \ { \ for (j=0; jn_sample; j++) \ { \ @@ -757,7 +757,12 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb int k_src, k_dst = 0; \ for (k_src=0; k_src