From 6ba09a2453e4a70a32ae826c4486cb89c6e1eaae Mon Sep 17 00:00:00 2001 From: eneskuluk <54481799+eneskuluk@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:55:34 -0700 Subject: [PATCH 1/7] initial commit, htslib genomicsdb pull --- Makefile | 24 +- faidx.c | 52 +++++ hfile.c | 2 +- htscodecs | 2 +- htslib/faidx.h | 4 + htslib/synced_bcf_reader.h | 1 + htslib/vcf.h | 68 +++++- synced_bcf_reader.c | 35 ++- vcf.c | 463 ++++++++++++++++++++++++++++++++++++- 9 files changed, 627 insertions(+), 24 deletions(-) diff --git a/Makefile b/Makefile index 88af18adc..ef5d61147 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,7 @@ CC = gcc AR = ar RANLIB = ranlib - +SOURCE_DIR = . # Default libraries to link if configure is not used htslib_default_libs = -lz -lm -lbz2 -llzma -lcurl @@ -35,10 +35,20 @@ CPPFLAGS = # TODO: probably update cram code to make it compile cleanly with -Wc++-compat # For testing strict C99 support add -std=c99 -D_XOPEN_SOURCE=600 #CFLAGS = -g -Wall -O2 -pedantic -std=c99 -D_XOPEN_SOURCE=600 -CFLAGS = -g -Wall -O2 -fvisibility=hidden +ifdef DEBUG + CFLAGS = -DDEBUG -g3 -gdwarf-3 + LDFLAGS = -g3 -gdwarf-3 +else + CFLAGS = -O3 + LDFLAGS = +endif +ifdef PROFILE + CFLAGS += -pg +endif +CFLAGS += -Wall -fPIC EXTRA_CFLAGS_PIC = -fpic TARGET_CFLAGS = -LDFLAGS = -fvisibility=hidden +LDFLAGS = VERSION_SCRIPT_LDFLAGS = -Wl,-version-script,$(srcprefix)htslib.map LIBS = $(htslib_default_libs) @@ -123,7 +133,7 @@ htscodecs.mk: $(srcdir)/hts_probe_cc.sh '$(CC)' '$(CFLAGS) $(CPPFLAGS)' '$(LDFLAGS)' >> $@ srcdir = . -srcprefix = +srcprefix = $(SOURCE_DIR)/ HTSPREFIX = # Flags for SIMD code @@ -138,7 +148,7 @@ HTS_BUILD_SSSE3 = HTS_BUILD_POPCNT = HTS_BUILD_SSE4_1 = -include htslib_vars.mk +include $(SOURCE_DIR)/htslib_vars.mk include htscodecs.mk # If not using GNU make, you need to copy the version number from version.sh @@ -189,10 +199,10 @@ config_vars.h: .SUFFIXES: .bundle .c .cygdll .dll .o .pico .so .c.o: - $(CC) $(CFLAGS) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) -c -o $@ $< + $(CC) $(CFLAGS) -I$(SOURCE_DIR) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) -c -o $@ $< .c.pico: - $(CC) $(CFLAGS) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $< + $(CC) $(CFLAGS) -I$(SOURCE_DIR) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $< LIBHTS_OBJS = \ diff --git a/faidx.c b/faidx.c index 5dd4bf1c0..8ac149485 100644 --- a/faidx.c +++ b/faidx.c @@ -914,6 +914,58 @@ int fai_adjust_region(const faidx_t *fai, int tid, return ((orig_beg != *beg ? 1 : 0) | (orig_end != *end && orig_end < HTS_POS_MAX ? 2 : 0)); } +static void fai_retrieve_into_buffer(const faidx_t *fai, const faidx1_t *val, + const uint64_t offset, const hts_pos_t beg, const hts_pos_t end, + char* s, hts_pos_t *len) { + size_t l; + int c = 0; + int ret; + + if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) { + hts_log_error("Range %"PRId64"..%"PRId64" too big", beg, end); + *len = -1; + return; + } + + ret = bgzf_useek(fai->bgzf, + offset + + beg / val->line_blen * val->line_len + + beg % val->line_blen, SEEK_SET); + + if (ret < 0) { + *len = -1; + hts_log_error("Failed to retrieve block. (Seeking in a compressed, .gzi unindexed, file?)"); + return; + } + + l = 0; + + while ( l < end - beg && (c=bgzf_getc(fai->bgzf))>=0 ) + if (isgraph(c)) s[l++] = c; + if (c < 0) { + hts_log_error("Failed to retrieve block: %s", + c == -1 ? "unexpected end of file" : "error reading file"); + *len = -1; + return; + } + + s[l] = '\0'; + *len = l < INT_MAX ? l : INT_MAX; +} + +void faidx_fetch_seq_into_buffer(const faidx_t *fai, + const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, char* s, hts_pos_t *len) +{ + faidx1_t val; + + // Adjust position + if (faidx_adjust_position(fai, 1,&val, c_name, &p_beg_i, &p_end_i, len)) { + *len = 0; + return; + } + + fai_retrieve_into_buffer(fai, &val, val.seq_offset, p_beg_i, p_end_i + 1, s, len); +} char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len) { diff --git a/hfile.c b/hfile.c index f8d42e49a..b8143928b 100644 --- a/hfile.c +++ b/hfile.c @@ -1123,7 +1123,7 @@ static hFILE *hopen_unknown_scheme(const char *fname, const char *mode) } /* Returns the appropriate handler, or NULL if the string isn't an URL. */ -static const struct hFILE_scheme_handler *find_scheme_handler(const char *s) +const struct hFILE_scheme_handler *find_scheme_handler(const char *s) { static const struct hFILE_scheme_handler unknown_scheme = { hopen_unknown_scheme, hfile_always_local, "built-in", 0 }; diff --git a/htscodecs b/htscodecs index 11b5007ff..dcb331678 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 11b5007ffb68bea9f6c777874a215e4187ce659a +Subproject commit dcb33167839622903897fc985a8cccf89b3358e2 diff --git a/htslib/faidx.h b/htslib/faidx.h index 4351b3fbe..12f3f4b40 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -237,6 +237,10 @@ by end users by calling `free()` on it. HTSLIB_EXPORT char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len); +void faidx_fetch_seq_into_buffer(const faidx_t *fai, + const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, + char* s, hts_pos_t *len); + /// Fetch the sequence in a region /** @param fai Pointer to the faidx_t struct @param c_name Region name diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index 9a6b48438..58d3d9389 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -141,6 +141,7 @@ typedef struct bcf_sr_t { htsFile *file; tbx_t *tbx_idx; + unsigned char read_one_record_only; hts_idx_t *bcf_idx; bcf_hdr_t *header; hts_itr_t *itr; diff --git a/htslib/vcf.h b/htslib/vcf.h index 70cf95372..d2cb6e6a8 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -61,16 +61,24 @@ extern "C" { #define BCF_HT_FLAG 0 // header type #define BCF_HT_INT 1 -#define BCF_HT_REAL 2 -#define BCF_HT_STR 3 -#define BCF_HT_LONG (BCF_HT_INT | 0x100) // BCF_HT_INT, but for int64_t values; VCF only! +#define BCF_HT_REAL 7 +#define BCF_HT_STR 8 +#define BCF_HT_CHAR 9 +#define BCF_HT_INT64 10 +#define BCF_HT_LONG BCF_HT_INT64 // BCF_HT_INT, but for int64_t values; VCF only! +#define BCF_HT_VOID 12 +#define BCF_NUM_HT_TYPES 14 +#define BCF_HT_UINT 2 +#define BCF_HT_UINT64 11 +#define BCF_HT_DOUBLE 13 #define BCF_VL_FIXED 0 // variable length #define BCF_VL_VAR 1 #define BCF_VL_A 2 #define BCF_VL_G 3 #define BCF_VL_R 4 - +#define BCF_VL_P 5 //ploidy +#define BCF_VL_Phased_Ploidy 6 //ploidy with phase /* === Dictionary === The header keeps three dictionaries. The first keeps IDs in the @@ -87,6 +95,10 @@ extern "C" { #define BCF_DT_CTG 1 #define BCF_DT_SAMPLE 2 +#define BCF_V_2_1_HEADER_MAGIC_STRING "BCF\2\1" +#define BCF_V_2_2_HEADER_MAGIC_STRING "BCF\2\2" +#define BCF_HEADER_MAGIC_STRING_LENGTH 5 + // Complete textual representation of a header line typedef struct bcf_hrec_t { int type; // One of the BCF_HL_* type @@ -142,12 +154,14 @@ extern uint8_t bcf_type_shift[]; #define VCF_SNP (1<<0) #define VCF_MNP (1<<1) #define VCF_INDEL (1<<2) -#define VCF_OTHER (1<<3) -#define VCF_BND (1<<4) // breakend -#define VCF_OVERLAP (1<<5) // overlapping deletion, ALT=* +#define VCF_OTHER 32 +#define VCF_BND 64 // breakend +#define VCF_OVERLAP 16 // overlapping deletion, ALT=* #define VCF_INS (1<<6) // implies VCF_INDEL #define VCF_DEL (1<<7) // implies VCF_INDEL #define VCF_ANY (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP|VCF_INS|VCF_DEL) // any variant type (but not VCF_REF) +#define VCF_NON_REF 8 +#define VCF_SPANNING_DELETION 16 typedef struct bcf_variant_t { int type, n; // variant type and the number of bases affected, negative for deletions @@ -237,6 +251,7 @@ typedef struct bcf1_t { hts_pos_t rlen; // length of REF int32_t rid; // CHROM float qual; // QUAL + hts_pos_t m_end_point; //END - must be after QUAL due to a memcpy() in vcf.c uint32_t n_info:16, n_allele:16; uint32_t n_fmt:8, n_sample:24; kstring_t shared, indiv; @@ -338,6 +353,7 @@ typedef struct bcf1_t { */ HTSLIB_EXPORT bcf_hdr_t *bcf_hdr_read(htsFile *fp) HTS_RESULT_USED; + bcf_hdr_t *bcf_hdr_read_required_sample_line(htsFile *hfp, const uint8_t is_sample_line_required); /** * bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed @@ -375,6 +391,16 @@ typedef struct bcf1_t { HTSLIB_EXPORT int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h) HTS_RESULT_USED; + + /* + * Serialize BCF header into buffer + * + * Returns new offset value in buffer if the new data fits within the buffer capacity, + * else returns the same offset value without modifying the buffer + */ + size_t bcf_hdr_serialize(bcf_hdr_t* h, uint8_t* buffer, size_t offset, const size_t capacity, const uint8_t is_bcf, const uint8_t keep_idx_fields); + + size_t bcf_hdr_deserialize(bcf_hdr_t* h, const uint8_t* buffer, const size_t offset, const size_t capacity, const uint8_t is_bcf); /** * Parse VCF line contained in kstring and populate the bcf1_t struct * The line must not end with \n or \r characters. @@ -396,6 +422,27 @@ typedef struct bcf1_t { HTSLIB_EXPORT int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s); + /* + * Same as vcf_format, but for bcfs + * + * Returns new offset value in buffer if the new data fits within the buffer capacity, + * else returns the same offset value without modifying the buffer + * + * If vcf, then the hdr and tmp pointers must be valid. For bcfs, they might be null + */ + size_t bcf_serialize(bcf1_t* v, uint8_t* buffer, size_t offset, const size_t capacity, const uint8_t is_bcf, const bcf_hdr_t* hdr, kstring_t* tmp); + /* + * Same as vcf_parse, but for bcfs + * + * Returns new offset value in buffer if a full vcf record is read, + * else returns the same offset value + * + * If vcf, then the hdr and tmp pointers must be valid. For bcfs, they might be null + * + * Note that vcf parsing modifies the buffer (tokenize function) + */ + size_t bcf_deserialize(bcf1_t* v, uint8_t* buffer, const size_t offset, const size_t capacity, const uint8_t is_bcf, const bcf_hdr_t* hdr); + /// Read next VCF or BCF record /** @param fp The file to read the record from @param h The header for the vcf/bcf file @@ -468,7 +515,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). */ HTSLIB_EXPORT bcf_hdr_t *vcf_hdr_read(htsFile *fp) HTS_RESULT_USED; - + bcf_hdr_t *vcf_hdr_read_required_sample_line(htsFile *fp, const uint8_t is_sample_line_required); /// Write a VCF format header /** @param fp Output file @param h The header to write @@ -651,6 +698,8 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). /** The following functions are for internal use and should rarely be called directly */ HTSLIB_EXPORT int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt); + int bcf_hdr_parse_required_sample_line(bcf_hdr_t *hdr, char *htxt, size_t* hdr_length, + const uint8_t is_sample_line_required); /// Synchronize internal header structures /** @param h Header @@ -988,6 +1037,8 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). return bcf_update_info(hdr, line, key, values, n, BCF_HT_LONG); } + void bcf_set_end_point_from_info(const bcf_hdr_t* hdr, bcf1_t* line); + /* * bcf_update_format_*() - functions for updating FORMAT fields * @values: pointer to the array of values, the same number of elements @@ -1248,6 +1299,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). #define bcf_hdr_id2coltype(hdr,type,int_id) (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf) #define bcf_hdr_idinfo_exists(hdr,type,int_id) ((int_id)>=0 && (int_id)<(hdr)->n[BCF_DT_ID] && (hdr)->id[BCF_DT_ID][int_id].val && bcf_hdr_id2coltype((hdr),(type),(int_id))!=0xf) #define bcf_hdr_id2hrec(hdr,dict_type,col_type,int_id) ((hdr)->id[(dict_type)==BCF_DT_CTG?BCF_DT_CTG:BCF_DT_ID][int_id].val->hrec[(dict_type)==BCF_DT_CTG?0:(col_type)]) + uint64_t bcf_hdr_id2contig_length(const bcf_hdr_t* hdr, const int id); /// Convert BCF FORMAT data to string form /** * @param s kstring to write into diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index a43ab15ae..a86aebc18 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -420,8 +420,12 @@ static void bcf_sr_destroy1(bcf_sr_t *reader) free(reader->fname); if ( reader->tbx_idx ) tbx_destroy(reader->tbx_idx); if ( reader->bcf_idx ) hts_idx_destroy(reader->bcf_idx); - bcf_hdr_destroy(reader->header); - hts_close(reader->file); + if (reader->header) { + bcf_hdr_destroy(reader->header); + } + if (reader->file) { + hts_close(reader->file); + } if ( reader->itr ) tbx_itr_destroy(reader->itr); int j; for (j=0; jmbuffer; j++) @@ -693,7 +697,7 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) reader->nbuffer++; if ( reader->buffer[reader->nbuffer]->rid != reader->buffer[1]->rid ) break; - if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; // the buffer is full + if ( reader->read_one_record_only || reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; // the buffer is full } if ( ret<0 ) { @@ -1044,6 +1048,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) kstring_t tmp = {0,0,0}; const char *sp = str, *ep = str; hts_pos_t from, to; + unsigned char inside_quotes = 0; while ( 1 ) { tmp.l = 0; @@ -1060,8 +1065,28 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) } else { - while ( *ep && *ep!=',' && *ep!=':' ) ep++; - kputsn(sp,ep-sp,&tmp); + //A quote is seen, flip flag inside_quotes + if(*ep == '"') + { + inside_quotes = 1 ^ inside_quotes; + sp = ++ep; + } + while ( *ep && ((inside_quotes && *ep!='"') || (!inside_quotes && *ep!=',' && *ep!=':')) ) ep++; + tmp.l = 0; + kputsn(sp,ep-sp,&tmp); + if(inside_quotes) + { + if(*ep == '"') + { + inside_quotes = 0; + ++ep; + } + else + { + fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s - terminating \" missing\n", __FILE__,__LINE__,__FUNCTION__,str); + free(reg); free(tmp.s); return NULL; + } + } } if ( *ep==':' ) { diff --git a/vcf.c b/vcf.c index c126f7354..4f87db5ff 100644 --- a/vcf.c +++ b/vcf.c @@ -1175,6 +1175,7 @@ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) { int len, done = 0; char *p = htxt; + int return_val = 0; // Check sanity: "fileformat" string must come as first bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len); @@ -1195,6 +1196,12 @@ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) // Parse the whole header do { while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) { + if(len < 0) + { + return_val = -1; + done = -1; + break; + } if (bcf_hdr_add_hrec(hdr, hrec) < 0) { bcf_hrec_destroy(hrec); return -1; @@ -1212,7 +1219,11 @@ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) // of the next one). p += len; continue; - } + } + if(done < 0) + break; + + // Next should be the sample line. If not, it was a malformed // header, in which case print a warning and skip (many VCF @@ -2140,6 +2151,7 @@ static int bcf1_sync(bcf1_t *line) return 0; } + bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src) { bcf1_sync(src); @@ -4353,6 +4365,432 @@ bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src) } return dst; } +typedef union { + uint32_t i; + float f; +} if_pair; + +bcf_hdr_t *vcf_hdr_read_required_sample_line(htsFile *fp, const uint8_t is_sample_line_required) +{ + kstring_t txt, *s = &fp->line; + int ret; + bcf_hdr_t *h; + tbx_t *idx = NULL; + const char **names = NULL; + h = bcf_hdr_init("r"); + if (!h) { + hts_log_error("Failed to allocate bcf header"); + return NULL; + } + txt.l = txt.m = 0; txt.s = 0; + while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) { + int e = 0; + if (s->l == 0) continue; + if (s->s[0] != '#') { + hts_log_error("No sample line"); + goto error; + } + if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here + kstring_t tmp = { 0, 0, NULL }; + hFILE *f = hopen(fp->fn_aux, "r"); + if (f == NULL) { + hts_log_error("Couldn't open \"%s\"", fp->fn_aux); + goto error; + } + while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) { + char *tab = strchr(tmp.s, '\t'); + if (tab == NULL) continue; + e |= (kputs("##contig=\n", 2, &txt) < 0); + } + free(tmp.s); + if (hclose(f) != 0) { + hts_log_error("Error on closing %s", fp->fn_aux); + goto error; + } + if (e) goto error; + } + if (kputsn(s->s, s->l, &txt) < 0) goto error; + if (kputc('\n', &txt) < 0) goto error; + if (s->s[1] != '#') break; + } + if ( ret < -1 ) goto error; + if ( !txt.s ) + { + hts_log_error("Could not read the header"); + goto error; + } + size_t hdr_length = 0ull; + if ( bcf_hdr_parse_required_sample_line(h, txt.s, &hdr_length, is_sample_line_required) < 0 ) goto error; + + // check tabix index, are all contigs listed in the header? add the missing ones + idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL); + if ( idx ) + { + int i, n, need_sync = 0; + names = tbx_seqnames(idx, &n); + if (!names) goto error; + for (i=0; ierrcode ) + { + // vcf_parse1() encountered a new contig or tag, undeclared in the + // header. At this point, the header must have been printed, + // proceeding would lead to a broken BCF file. Errors must be checked + // and cleared by the caller before we can proceed. + hts_log_error("Unchecked error (%d)", v->errcode); + return -1; + } + bcf1_sync(v); // check if the BCF record was modified + if(is_bcf) + { + if((offset+8*sizeof(int)+v->shared.l+v->indiv.l) <= capacity) + { + //First 8 integers represent various lengths + if_pair* x = (if_pair*)(buffer+offset); + x[0].i = v->shared.l + 24; // to include six 32-bit integers + x[1].i = v->indiv.l; + x[2].i = v->rid; + x[3].i = v->pos; + x[4].i = v->rlen; + x[5].f = v->qual; + x[6].i = (uint32_t)v->n_allele<<16 | v->n_info; + x[7].i = (uint32_t)v->n_fmt<<24 | v->n_sample; + offset += 8*sizeof(int); + memcpy(buffer+offset, v->shared.s, v->shared.l); + offset += v->shared.l; + memcpy(buffer+offset, v->indiv.s, v->indiv.l); + offset += v->indiv.l; + } + } + else + { + tmp->l = 0; + int status = vcf_format(hdr, v, tmp); + assert(status == 0); + if((offset+tmp->l) <= capacity) + { + memcpy(buffer+offset, tmp->s, tmp->l); + offset += tmp->l; + } + } + return offset; +} + +bcf_hdr_t *bcf_hdr_read_required_sample_line(htsFile *hfp, const uint8_t is_sample_line_required) +{ + if (hfp->format.format == vcf) + return vcf_hdr_read_required_sample_line(hfp, is_sample_line_required); + if (hfp->format.format != bcf) { + hts_log_error("Input is not detected as bcf or vcf format"); + return NULL; + } + + assert(hfp->is_bgzf); + + BGZF *fp = hfp->fp.bgzf; + uint8_t magic[5]; + bcf_hdr_t *h; + h = bcf_hdr_init("r"); + if (!h) { + hts_log_error("Failed to allocate bcf header"); + return NULL; + } + if (bgzf_read(fp, magic, 5) != 5) + { + hts_log_error("Failed to read the header (reading BCF in text mode?)"); + bcf_hdr_destroy(h); + return NULL; + } + if (strncmp((char*)magic, "BCF\2\2", 5) != 0) + { + if (!strncmp((char*)magic, "BCF", 3)) + hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported"); + else + hts_log_error("Invalid BCF2 magic string"); + bcf_hdr_destroy(h); + return NULL; + } + uint8_t buf[4]; + size_t hlen; + char *htxt = NULL; + if (bgzf_read(fp, buf, 4) != 4) goto fail; + hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24); + if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; } + htxt = (char*)malloc(hlen + 1); + if (!htxt) goto fail; + if (bgzf_read(fp, htxt, hlen) != hlen) goto fail; + htxt[hlen] = '\0'; // Ensure htxt is terminated + size_t hdr_length = 0ull; + bcf_hdr_parse_required_sample_line(h, htxt, &hdr_length, is_sample_line_required); // FIXME: Does this return anything meaningful? + free(htxt); + return h; + fail: + hts_log_error("Failed to read BCF header"); + free(htxt); + bcf_hdr_destroy(h); + return NULL; +} + + +int bcf_hdr_parse_required_sample_line(bcf_hdr_t *hdr, char *htxt, size_t* hdr_length, + const uint8_t is_sample_line_required) +{ + int len, done = 0; + char *p = htxt; + int return_val = 0; + + // Check sanity: "fileformat" string must come as first + bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len); + if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") ) + hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?"); + + if (bcf_hdr_add_hrec(hdr, hrec) < 0) { + bcf_hrec_destroy(hrec); + return -1; + } + + // The filter PASS must appear first in the dictionary + hrec = bcf_hdr_parse_line(hdr,"##FILTER=",&len); + if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) { + bcf_hrec_destroy(hrec); + return -1; + } + + // Parse the whole header + do { + while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) { + if(len < 0) + { + return_val = -1; + done = -1; + break; + } + if (bcf_hdr_add_hrec(hdr, hrec) < 0) { + bcf_hrec_destroy(hrec); + return -1; + } + p += len; + } + assert(hrec == NULL); + if (len < 0) { + // len < 0 indicates out-of-memory, or similar error + hts_log_error("Could not parse header line: %s", strerror(errno)); + return -1; + } else if (len > 0) { + // Bad header line. bcf_hdr_parse_line() will have logged it. + // Skip and try again on the next line (p + len will be the start + // of the next one). + p += len; + continue; + } + + if(done < 0) + break; + + // Next should be the sample line. If not, it was a malformed + // header, in which case print a warning and skip (many VCF + // operations do not really care about a few malformed lines). + // In the future we may want to add a strict mode that errors in + // this case. + if ( strncmp("#CHROM\tPOS",p,10) != 0 ) { + char *eol = strchr(p, '\n'); + if (*p != '\0') { + char buffer[320]; + hts_log_warning("Could not parse header line: %s", + hts_strprint(buffer, sizeof(buffer), + '"', p, + eol ? (eol - p) : SIZE_MAX)); + } + if (eol) { + p = eol + 1; // Try from the next line. + } else { + done = -1; // No more lines left, give up. + } + } else { + done = 1; // Sample line found + } + } while (!done); + + size_t sample_line_length = 0; + if (done < 0) { + if(is_sample_line_required) + { + // No sample line is fatal. + hts_log_error("Could not parse the header, sample line not found"); + return -1; + } + } + else + { + if(return_val >= 0) + return_val = bcf_hdr_parse_sample_line(hdr,p); + } + (*hdr_length) = ((size_t)(p - htxt)) + sample_line_length; + if(return_val >= 0) + return_val = bcf_hdr_sync(hdr); + if(return_val >= 0) + bcf_hdr_check_sanity(hdr); + return return_val; +} + + + + + +size_t bcf_hdr_serialize(bcf_hdr_t* h, uint8_t* buffer, size_t offset, const size_t capacity, const uint8_t is_bcf, const uint8_t keep_idx_fields) +{ + if (!h) { + errno = EINVAL; + return offset; + } + if ( h->dirty ) { + if (bcf_hdr_sync(h) < 0) return offset; + } + + kstring_t htxt = {0,0,0}; + bcf_hdr_format(h, (is_bcf & keep_idx_fields), &htxt); + uint32_t hlen = htxt.l; + if(is_bcf) + { + kputc('\0', &htxt); // include the \0 byte + ++hlen; + + if((offset+5+sizeof(int)+hlen) <= capacity) + { + if(!keep_idx_fields) //htsjdk cannot deal with 2.2 header + memcpy(buffer+offset, "BCF\2\1", 5); + else + memcpy(buffer+offset, "BCF\2\2", 5); + offset += 5; + memcpy(buffer+offset, &hlen, sizeof(int)); + offset += sizeof(int); + memcpy(buffer+offset, htxt.s, hlen); + offset += hlen; + } + } + else + { + if(offset+hlen <= capacity) + { + memcpy(buffer+offset, htxt.s, hlen); + offset += hlen; + } + } + free(htxt.s); + return offset; +} + +size_t bcf_hdr_deserialize(bcf_hdr_t* h, const uint8_t* buffer, const size_t offset, const size_t capacity, const uint8_t is_bcf) +{ + size_t hdr_length = 0ull; + size_t curr_offset = offset; + if(is_bcf) + { + //magic string + hdr length + if(curr_offset+BCF_HEADER_MAGIC_STRING_LENGTH+sizeof(int) > capacity) + return offset; + const char* buffer_magic_string = (const char*)(buffer+curr_offset); + if(strncmp(buffer_magic_string, BCF_V_2_2_HEADER_MAGIC_STRING, BCF_HEADER_MAGIC_STRING_LENGTH) != 0 + && strncmp(buffer_magic_string, BCF_V_2_1_HEADER_MAGIC_STRING, BCF_HEADER_MAGIC_STRING_LENGTH) != 0) + { + fprintf(stderr,"[%s:%d %s] invalid BCF2 magic string: only BCFv2.2 and BCFv2.1 are supported.\n", __FILE__,__LINE__,__FUNCTION__); + return offset; + } + curr_offset += BCF_HEADER_MAGIC_STRING_LENGTH; + //Header length + memcpy(&hdr_length, buffer+curr_offset, sizeof(int)); + curr_offset += sizeof(int); + if(curr_offset+hdr_length > capacity) + return offset; + } + bcf_hdr_parse(h, (char*)(buffer+curr_offset)); + return curr_offset+hdr_length; +} + +size_t bcf_deserialize(bcf1_t* v, uint8_t* buffer, const size_t offset, const size_t capacity, const uint8_t is_bcf, const bcf_hdr_t* hdr) +{ + if(is_bcf) + { + bcf_clear(v); + size_t curr_offset = offset; + if(curr_offset+8*sizeof(uint32_t) >= capacity) + return offset; + const if_pair* x = (if_pair*)(buffer+curr_offset); + size_t shared_length = x[0].i-6*sizeof(int); + size_t indiv_length = x[1].i; + if(curr_offset+8*sizeof(uint32_t)+shared_length+indiv_length > capacity) + return offset; + ks_resize(&v->shared, shared_length); + ks_resize(&v->indiv, indiv_length); + v->rid = x[2].i; + v->pos = x[3].i; + v->rlen = x[4].i; + v->qual = x[5].f; + v->n_allele = (x[6].i)>>16; v->n_info = (x[6].i)&0xffff; + v->n_fmt = (x[7].i)>>24; v->n_sample = (x[7].i)&0xffffff; + v->shared.l = shared_length, v->indiv.l = indiv_length; + // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4 + if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0; + curr_offset += 8*sizeof(uint32_t); + + memcpy(v->shared.s, buffer+curr_offset, shared_length); + curr_offset += shared_length; + + memcpy(v->indiv.s, buffer+curr_offset, indiv_length); + curr_offset += indiv_length; + return curr_offset; + } + else + { + kstring_t tmp; + assert(offset < capacity); + tmp.s = (char*)(buffer+offset); + size_t max_length = capacity-offset; + size_t line_length = max_length; + //See if newline exists + char* line_end_ptr = (char*)(memchr(tmp.s, '\n', max_length)); + if(line_end_ptr) + { + line_length = ((size_t)(line_end_ptr - tmp.s)); + *line_end_ptr = 0; //replace '\n' with null byte, vcf_parse doesn't like '\n' + } + tmp.l = line_length; + tmp.m = max_length; + int status = vcf_parse(&tmp, hdr, v); + //vcf parsed succesfully + if(status == 0) + return offset + line_length + (line_end_ptr ? 1u : 0u); //for the \n character + else + return offset; + } +} int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line) { @@ -4721,6 +5159,7 @@ static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t if ( alt[0]=='<' ) { if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant + if( strncmp(alt, "", 9) == 0) { var->n = 0; var->type = VCF_NON_REF; return; } if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; } if ( !strcmp("NON_REF>",alt+1) ) { var->n = 0; var->type = VCF_REF; return; } var->type = VCF_OTHER; @@ -4830,7 +5269,7 @@ int bcf_get_variant_type(bcf1_t *rec, int ith_allele) hts_log_error("Requested allele outside valid range"); exit(1); } - return rec->d.var[ith_allele].type & ORIG_VAR_TYPES; + return rec->d.var[ith_allele].type; } #undef ORIG_VAR_TYPES @@ -5664,3 +6103,23 @@ const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) { return buffer; } +uint64_t bcf_hdr_id2contig_length(const bcf_hdr_t* hdr, const int id) +{ + bcf_hrec_t* hrec = bcf_hdr_id2hrec(hdr, BCF_DT_CTG, 0, id); + int i = 0; + for(i=0;inkeys;++i) + if(strcmp(hrec->keys[i], "length") == 0) + return strtoull(hrec->vals[i], 0, 10); + return 0; +} + +void bcf_set_end_point_from_info(const bcf_hdr_t* hdr, bcf1_t* line) +{ + bcf_unpack(line, BCF_UN_INFO); + bcf_info_t* info = bcf_get_info(hdr, line, "END"); + if(info) + line->m_end_point = info->v1.i - 1; //END value is 1 based, line->pos is 0 based, change to 0 based + else //no END tag, end is same as pos if not deletion, else depends on rlen + line->m_end_point = line->pos + line->rlen - 1; +} + From 952ce612c7b1ef5afcd55cf19fde2da9f6f7eb87 Mon Sep 17 00:00:00 2001 From: eneskuluk <54481799+eneskuluk@users.noreply.github.com> Date: Mon, 28 Aug 2023 14:05:50 -0700 Subject: [PATCH 2/7] rebase, merged with latest develop, includes speed ups --- htslib/vcf.h | 17 +++-- kstring.c | 10 +-- vcf.c | 181 ++++++++++++++++++++++++++++++++------------------- 3 files changed, 132 insertions(+), 76 deletions(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index d2cb6e6a8..67c8cb5db 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -157,8 +157,8 @@ extern uint8_t bcf_type_shift[]; #define VCF_OTHER 32 #define VCF_BND 64 // breakend #define VCF_OVERLAP 16 // overlapping deletion, ALT=* -#define VCF_INS (1<<6) // implies VCF_INDEL -#define VCF_DEL (1<<7) // implies VCF_INDEL +#define VCF_INS VCF_INDEL // implies VCF_INDEL +#define VCF_DEL VCF_INDEL // implies VCF_INDEL #define VCF_ANY (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP|VCF_INS|VCF_DEL) // any variant type (but not VCF_REF) #define VCF_NON_REF 8 #define VCF_SPANNING_DELETION 16 @@ -1572,7 +1572,7 @@ static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) return e == 0 ? 0 : -1; } -static inline int bcf_enc_size(kstring_t *s, int size, int type) +static inline int bcf_enc_size(kstring_t *s, size_t size, int type) { // Most common case is first if (size < 15) { @@ -1598,11 +1598,17 @@ static inline int bcf_enc_size(kstring_t *s, int size, int type) *p++ = 1<<4|BCF_BT_INT16; i16_to_le(size, p); s->l += 4; - } else { + } + else if(size <= INT32_MAX){ *p++ = 1<<4|BCF_BT_INT32; i32_to_le(size, p); s->l += 6; } + else{ + *p++ = 1<<4|BCF_BT_INT64; + s->l += 10; + return -1; + } } return 0; } @@ -1611,7 +1617,8 @@ static inline int bcf_enc_inttype(long x) { if (x <= BCF_MAX_BT_INT8 && x >= BCF_MIN_BT_INT8) return BCF_BT_INT8; if (x <= BCF_MAX_BT_INT16 && x >= BCF_MIN_BT_INT16) return BCF_BT_INT16; - return BCF_BT_INT32; + if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32) return BCF_BT_INT32; + return BCF_BT_INT64; } static inline int bcf_enc_int1(kstring_t *s, int32_t x) diff --git a/kstring.c b/kstring.c index f8e0f9f3d..5284d53d0 100644 --- a/kstring.c +++ b/kstring.c @@ -57,7 +57,7 @@ int kputd(double d, kstring_t *s) { if (ks_resize(s, s->l + 50) < 0) return EOF; // We let stdio handle the exponent cases - int s2 = snprintf(s->s + s->l, s->m - s->l, "%g", d); + int s2 = snprintf(s->s + s->l, s->m - s->l, "%#g", d); len += s2; s->l += s2; return len; @@ -116,7 +116,7 @@ int kputd(double d, kstring_t *s) { } xp[0] = '.'; cp[7] = 0; ep=cp+6; - if (cp[6] == '.') cp[6] = 0; + //if (cp[6] == '.') cp[6] = 0; } // Cull trailing zeros @@ -125,8 +125,10 @@ int kputd(double d, kstring_t *s) { char *z = ep+1; while (ep > cp) { if (*ep == '.') { - if (z[-1] == '.') - z[-1] = 0; + if (z[-1] == '.'){ + z[0] = '0'; + z[1] = 0; + } else z[0] = 0; break; diff --git a/vcf.c b/vcf.c index 4f87db5ff..db7a01745 100644 --- a/vcf.c +++ b/vcf.c @@ -2621,6 +2621,76 @@ static int bcf_enc_long1(kstring_t *s, int64_t x) { } #endif +int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize) +{ + int64_t max = INT64_MIN, min = INT64_MAX; + int i; + if (n <= 0) return bcf_enc_size(s, 0, BCF_BT_NULL); + else if (n == 1) return bcf_enc_long1(s, a[0]); + else { + if (wsize <= 0) wsize = n; + for (i = 0; i < n; ++i) { + if (a[i] == bcf_int64_missing || a[i] == bcf_int64_vector_end ) continue; + if (max < a[i]) max = a[i]; + if (min > a[i]) min = a[i]; + } + if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) { + bcf_enc_size(s, wsize, BCF_BT_INT8); + for (i = 0; i < n; ++i) + if ( a[i]==bcf_int64_vector_end ) kputc(bcf_int8_vector_end, s); + else if ( a[i]==bcf_int64_missing ) kputc(bcf_int8_missing, s); + else kputc(a[i], s); + } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) { + uint8_t *p; + bcf_enc_size(s, wsize, BCF_BT_INT16); + ks_resize(s, s->l + n * sizeof(int16_t)); + p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i) + { + int16_t x; + if ( a[i]==bcf_int64_vector_end ) x = bcf_int16_vector_end; + else if ( a[i]==bcf_int64_missing ) x = bcf_int16_missing; + else x = a[i]; + i16_to_le(x, p); + p += sizeof(int16_t); + } + s->l += n * sizeof(int16_t); + } else if(max <= BCF_MAX_BT_INT32 && min >= BCF_MIN_BT_INT32){ + uint8_t *p; + bcf_enc_size(s, wsize, BCF_BT_INT32); + ks_resize(s, s->l + n * sizeof(int32_t)); + p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i) { + int32_t x; + if ( a[i]==bcf_int64_vector_end ) x = bcf_int32_vector_end; + else if ( a[i]==bcf_int64_missing ) x = bcf_int32_missing; + else x = a[i]; + i32_to_le(x, p); + p += sizeof(int32_t); + } + s->l += n * sizeof(int32_t); + } +#ifdef VCF_ALLOW_INT64 + else { + uint8_t *p; + bcf_enc_size(s, wsize, BCF_BT_INT64); + ks_resize(s, s->l + n * sizeof(int64_t)); + p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i) { + int64_t x = a[i]; + i64_to_le(x, p); + p += sizeof(int64_t); + } + s->l += n * sizeof(int64_t); + } +#else + return -1; +#endif + } + + return 0; // FIXME: check for errs in this function +} + static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) { uint8_t *p; size_t i; @@ -2687,6 +2757,7 @@ int bcf_fmt_array(kstring_t *s, int n, int type, void *data) case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, v==bcf_int8_missing, v==bcf_int8_vector_end, kputw(v, s)); break; case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break; case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break; + case BCF_BT_INT64: BRANCH(int64_t, le_to_i64, v==bcf_int64_missing, v==bcf_int64_vector_end, kputll(v, s)); break; case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break; default: hts_log_error("Unexpected type %d", type); exit(1); break; } @@ -3360,8 +3431,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p char *r, *key; khint_t k; vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; - int32_t *a_val = NULL; - + int64_t *a_val = NULL; v->n_info = 0; if (*(q-1) == ';') *(q-1) = 0; for (r = key = p;; ++r) { @@ -3417,7 +3487,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p if (*t == ',') ++n_val; // Check both int and float size in one step for simplicity if (n_val > max_n_val) { - int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val)); + int64_t *a_tmp = (int64_t *)realloc(a_val, n_val * sizeof(*a_val)); if (!a_tmp) { hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; // No appropriate code? @@ -3426,67 +3496,35 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p a_val = a_tmp; max_n_val = n_val; } - if ((y>>4&0xf) == BCF_HT_INT) { + if (((y >> 4 & 0xf) == BCF_HT_INT) || + ((y >> 4 & 0xf) == BCF_HT_LONG)) { i = 0, t = val; int64_t val1; - int is_int64 = 0; -#ifdef VCF_ALLOW_INT64 - if ( n_val==1 ) - { - overflow = 0; - long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow); - if ( te==val ) tmp_val = bcf_int32_missing; - else if ( overflow || tmp_valBCF_MAX_BT_INT64 ) - { - if ( !extreme_int_warned ) - { - hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1); - extreme_int_warned = 1; - } - tmp_val = bcf_int32_missing; - } - else - is_int64 = 1; - val1 = tmp_val; - t = te; - i = 1; // this is just to avoid adding another nested block... - } -#endif - for (; i < n_val; ++i, ++t) - { + for (; i < n_val; ++i, ++t) { overflow = 0; - long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow); - if ( te==t ) tmp_val = bcf_int32_missing; - else if ( overflow || tmp_valBCF_MAX_BT_INT32 ) - { - if ( !extreme_int_warned ) - { - hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1); + long long int tmp_val = hts_str2int( + t, &te, sizeof(tmp_val) * CHAR_BIT, &overflow); + if (te == t) + tmp_val = bcf_int64_missing; + else if (overflow || tmp_val < BCF_MIN_BT_INT64 || + tmp_val > BCF_MAX_BT_INT64) { + if (!extreme_int_warned) { + hts_log_warning( + "Extreme INFO/%s value encountered and set to " + "missing at %s:%" PRIhts_pos, + key, bcf_seqname_safe(h, v), v->pos + 1); extreme_int_warned = 1; } - tmp_val = bcf_int32_missing; + tmp_val = bcf_int64_missing; } a_val[i] = tmp_val; - for (t = te; *t && *t != ','; t++); - } - if (n_val == 1) { -#ifdef VCF_ALLOW_INT64 - if ( is_int64 ) - { - v->unpacked |= BCF_IS_64BIT; - bcf_enc_long1(str, val1); - } - else - bcf_enc_int1(str, (int32_t)val1); -#else - val1 = a_val[0]; - bcf_enc_int1(str, (int32_t)val1); -#endif - } else { - bcf_enc_vint(str, n_val, a_val, -1); + for (t = te; *t && *t != ','; t++) + ; } - if (n_val==1 && (val1!=bcf_int32_missing || is_int64) - && memcmp(key, "END", 4) == 0) + v->unpacked |= BCF_IS_64BIT; + bcf_enc_vlong(str, n_val, a_val, -1); + val1 = a_val[0]; + if (n_val==1 && strcmp(key, "END") == 0) { if ( val1 <= v->pos ) { @@ -3499,7 +3537,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p else v->rlen = val1 - v->pos; } - } else if ((y>>4&0xf) == BCF_HT_REAL) { + } else if ((y >> 4 & 0xf) == BCF_HT_REAL) { float *val_f = (float *)a_val; for (i = 0, t = val; i < n_val; ++i, ++t) { @@ -4730,8 +4768,7 @@ size_t bcf_hdr_deserialize(bcf_hdr_t* h, const uint8_t* buffer, const size_t off if(curr_offset+hdr_length > capacity) return offset; } - bcf_hdr_parse(h, (char*)(buffer+curr_offset)); - return curr_offset+hdr_length; + return bcf_hdr_parse(h, (char*)(buffer+curr_offset)); } size_t bcf_deserialize(bcf1_t* v, uint8_t* buffer, const size_t offset, const size_t capacity, const uint8_t is_bcf, const bcf_hdr_t* hdr) @@ -5245,7 +5282,7 @@ static int bcf_set_variant_types(bcf1_t *b) // to be compatible with callers that are not expecting newer values // like VCF_INS, VCF_DEL. The full set is available from the newer // vcf_has_variant_type* interfaces. -#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP) +#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP|VCF_NON_REF) int bcf_get_variant_types(bcf1_t *rec) { if ( rec->d.var_type==-1 ) { @@ -5269,7 +5306,7 @@ int bcf_get_variant_type(bcf1_t *rec, int ith_allele) hts_log_error("Requested allele outside valid range"); exit(1); } - return rec->d.var[ith_allele].type; + return rec->d.var[ith_allele].type & ORIG_VAR_TYPES; } #undef ORIG_VAR_TYPES @@ -5385,11 +5422,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v #ifdef VCF_ALLOW_INT64 else if ( type==BCF_HT_LONG ) { - if (n != 1) { - hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1); - abort(); - } - bcf_enc_long1(&str, *(int64_t *) values); + bcf_enc_vlong(&str, n, (const int64_t*)values, -1); } #endif else @@ -5843,7 +5876,12 @@ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, voi { int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag); if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header - if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2; // expected different type + if((type & 0xff) == BCF_HT_LONG) { + const int ht_type_in_hdr = bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id); + if(ht_type_in_hdr != BCF_HT_INT && ht_type_in_hdr != BCF_HT_LONG) return -2; // expected different type + } + else + if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2; // expected different type if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO); @@ -5916,6 +5954,14 @@ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, voi } else { BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break; } + case BCF_BT_INT64: + if (type == BCF_HT_LONG) { + BRANCH(int64_t, le_to_i64, p==bcf_int64_missing, p==bcf_int64_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); + } else { + hts_log_error("Trying to get 32-bit int data from a field which contains 64 bit values"); + return -2; + } + break; case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break; default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2; } @@ -6025,6 +6071,7 @@ int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, v case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break; case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break; case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break; + case BCF_BT_INT64: BRANCH(int64_t, le_to_i64, p==bcf_int64_missing, p==bcf_int64_vector_end, *tmp=bcf_int64_missing, *tmp=bcf_int64_vector_end, *tmp=p, int64_t); break; case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break; default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1); } From 12759e4478acd527981a4974211a8e0a76d750dc Mon Sep 17 00:00:00 2001 From: eneskuluk <54481799+eneskuluk@users.noreply.github.com> Date: Wed, 6 Sep 2023 11:55:49 -0700 Subject: [PATCH 3/7] tests are passing with speed-up improvements, a few improvements are left out to make test pass for now --- htslib/vcf.h | 5 +-- synced_bcf_reader.c | 25 ++----------- vcf.c | 87 +++++++++++++++++++++++++++++++++------------ 3 files changed, 69 insertions(+), 48 deletions(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index 67c8cb5db..de830dda5 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -1605,8 +1605,9 @@ static inline int bcf_enc_size(kstring_t *s, size_t size, int type) s->l += 6; } else{ - *p++ = 1<<4|BCF_BT_INT64; - s->l += 10; + *p++ = 1<<4|BCF_BT_INT64;// + i64_to_le(size,p); + s->l += 10; // not so sure about +10 here, whether it is accurate or changes anything. return -1; } } diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index a86aebc18..acb208488 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -1048,7 +1048,6 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) kstring_t tmp = {0,0,0}; const char *sp = str, *ep = str; hts_pos_t from, to; - unsigned char inside_quotes = 0; while ( 1 ) { tmp.l = 0; @@ -1065,28 +1064,8 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) } else { - //A quote is seen, flip flag inside_quotes - if(*ep == '"') - { - inside_quotes = 1 ^ inside_quotes; - sp = ++ep; - } - while ( *ep && ((inside_quotes && *ep!='"') || (!inside_quotes && *ep!=',' && *ep!=':')) ) ep++; - tmp.l = 0; - kputsn(sp,ep-sp,&tmp); - if(inside_quotes) - { - if(*ep == '"') - { - inside_quotes = 0; - ++ep; - } - else - { - fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s - terminating \" missing\n", __FILE__,__LINE__,__FUNCTION__,str); - free(reg); free(tmp.s); return NULL; - } - } + while ( *ep && *ep!=',' && *ep!=':' ) ep++; + kputsn(sp,ep-sp,&tmp); } if ( *ep==':' ) { diff --git a/vcf.c b/vcf.c index db7a01745..f5e00b675 100644 --- a/vcf.c +++ b/vcf.c @@ -2621,29 +2621,68 @@ static int bcf_enc_long1(kstring_t *s, int64_t x) { } #endif -int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize) +int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize)//supposed to be optimized, not working yet. { int64_t max = INT64_MIN, min = INT64_MAX; int i; - if (n <= 0) return bcf_enc_size(s, 0, BCF_BT_NULL); - else if (n == 1) return bcf_enc_long1(s, a[0]); - else { + if (n <= 0) { + return bcf_enc_size(s, 0, BCF_BT_NULL); + } else if (n == 1) { + return bcf_enc_long1(s, a[0]); + } else { if (wsize <= 0) wsize = n; - for (i = 0; i < n; ++i) { - if (a[i] == bcf_int64_missing || a[i] == bcf_int64_vector_end ) continue; + + // Equivalent to: + // for (i = 0; i < n; ++i) { + // if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) + // continue; + // if (max < a[i]) max = a[i]; + // if (min > a[i]) min = a[i]; + // } + int64_t max4[4] = {INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN}; + int64_t min4[4] = {INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX}; + for (i = 0; i < (n&~3); i+=4) { + // bcf_int32_missing == INT32_MIN and + // bcf_int32_vector_end == INT32_MIN+1. + // We skip these, but can mostly avoid explicit checking + if (max4[0] < a[i+0]) max4[0] = a[i+0]; + if (max4[1] < a[i+1]) max4[1] = a[i+1]; + if (max4[2] < a[i+2]) max4[2] = a[i+2]; + if (max4[3] < a[i+3]) max4[3] = a[i+3]; + if (min4[0] > a[i+0] && a[i+0] > INT64_MIN+1) min4[0] = a[i+0]; + if (min4[1] > a[i+1] && a[i+1] > INT64_MIN+1) min4[1] = a[i+1]; + if (min4[2] > a[i+2] && a[i+2] > INT64_MIN+1) min4[2] = a[i+2]; + if (min4[3] > a[i+3] && a[i+3] > INT64_MIN+1) min4[3] = a[i+3]; + } + min = min4[0]; + if (min > min4[1]) min = min4[1]; + if (min > min4[2]) min = min4[2]; + if (min > min4[3]) min = min4[3]; + max = max4[0]; + if (max < max4[1]) max = max4[1]; + if (max < max4[2]) max = max4[2]; + if (max < max4[3]) max = max4[3]; + for (; i < n; ++i) { if (max < a[i]) max = a[i]; - if (min > a[i]) min = a[i]; + if (min > a[i] && a[i] > INT64_MIN+1) min = a[i]; } + if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) { - bcf_enc_size(s, wsize, BCF_BT_INT8); - for (i = 0; i < n; ++i) - if ( a[i]==bcf_int64_vector_end ) kputc(bcf_int8_vector_end, s); - else if ( a[i]==bcf_int64_missing ) kputc(bcf_int8_missing, s); - else kputc(a[i], s); + if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 || + ks_resize(s, s->l + n) < 0) + return -1; + uint8_t *p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i, p++) { + if ( a[i]==bcf_int64_vector_end ) *p = bcf_int8_vector_end; + else if ( a[i]==bcf_int64_missing ) *p = bcf_int8_missing; + else *p = a[i]; + } + s->l += n; } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) { uint8_t *p; - bcf_enc_size(s, wsize, BCF_BT_INT16); - ks_resize(s, s->l + n * sizeof(int16_t)); + if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 || + ks_resize(s, s->l + n * sizeof(int16_t)) < 0) + return -1; p = (uint8_t *) s->s + s->l; for (i = 0; i < n; ++i) { @@ -2657,8 +2696,9 @@ int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize) s->l += n * sizeof(int16_t); } else if(max <= BCF_MAX_BT_INT32 && min >= BCF_MIN_BT_INT32){ uint8_t *p; - bcf_enc_size(s, wsize, BCF_BT_INT32); - ks_resize(s, s->l + n * sizeof(int32_t)); + if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 || + ks_resize(s, s->l + n * sizeof(int32_t)) < 0) + return -1; p = (uint8_t *) s->s + s->l; for (i = 0; i < n; ++i) { int32_t x; @@ -2670,11 +2710,11 @@ int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize) } s->l += n * sizeof(int32_t); } -#ifdef VCF_ALLOW_INT64 + #ifdef VCF_ALLOW_INT64 else { uint8_t *p; - bcf_enc_size(s, wsize, BCF_BT_INT64); - ks_resize(s, s->l + n * sizeof(int64_t)); + if(bcf_enc_size(s, wsize, BCF_BT_INT64) < 0 || ks_resize(s, s->l + n * sizeof(int64_t)) < 0) + return -1; p = (uint8_t *) s->s + s->l; for (i = 0; i < n; ++i) { int64_t x = a[i]; @@ -2688,7 +2728,7 @@ int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize) #endif } - return 0; // FIXME: check for errs in this function + return 0; } static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) { @@ -3524,7 +3564,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p v->unpacked |= BCF_IS_64BIT; bcf_enc_vlong(str, n_val, a_val, -1); val1 = a_val[0]; - if (n_val==1 && strcmp(key, "END") == 0) + if (n_val==1 && strcmp(key, "END") == 0)//memset instead of strcmp { if ( val1 <= v->pos ) { @@ -3586,6 +3626,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) // Ensure string we parse has space to permit some over-flow when during // parsing. Eg to do memcmp(key, "END", 4) in vcf_parse_info over // the more straight forward looking strcmp, giving a speed advantage. + /* if (ks_resize(s, s->l+4) < 0) return -1; @@ -3598,7 +3639,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) s->s[s->l+1] = 0; s->s[s->l+2] = 0; s->s[s->l+3] = 0; - + */ // commented out the part that was required for optimization in vcf_parse_info function, will take a look later. bcf_clear1(v); str = &v->shared; memset(&aux, 0, sizeof(ks_tokaux_t)); @@ -4657,7 +4698,7 @@ int bcf_hdr_parse_required_sample_line(bcf_hdr_t *hdr, char *htxt, size_t* hdr_l // operations do not really care about a few malformed lines). // In the future we may want to add a strict mode that errors in // this case. - if ( strncmp("#CHROM\tPOS",p,10) != 0 ) { + if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) { char *eol = strchr(p, '\n'); if (*p != '\0') { char buffer[320]; From ce36a3daa2100e8c00cd71c4baf266b35ec8afe7 Mon Sep 17 00:00:00 2001 From: eneskuluk <54481799+eneskuluk@users.noreply.github.com> Date: Wed, 6 Sep 2023 12:26:13 -0700 Subject: [PATCH 4/7] fix comment --- vcf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vcf.c b/vcf.c index f5e00b675..e64326263 100644 --- a/vcf.c +++ b/vcf.c @@ -2621,7 +2621,7 @@ static int bcf_enc_long1(kstring_t *s, int64_t x) { } #endif -int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize)//supposed to be optimized, not working yet. +int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize) { int64_t max = INT64_MIN, min = INT64_MAX; int i; @@ -3564,7 +3564,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p v->unpacked |= BCF_IS_64BIT; bcf_enc_vlong(str, n_val, a_val, -1); val1 = a_val[0]; - if (n_val==1 && strcmp(key, "END") == 0)//memset instead of strcmp + if (n_val==1 && strcmp(key, "END") == 0)//memcmp instead of strcmp { if ( val1 <= v->pos ) { From 9913d3750c1164d548456366b9a2ab180ccbcbce Mon Sep 17 00:00:00 2001 From: eneskuluk <54481799+eneskuluk@users.noreply.github.com> Date: Wed, 6 Sep 2023 15:23:06 -0700 Subject: [PATCH 5/7] add back the optimization --- vcf.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vcf.c b/vcf.c index e64326263..d23d84b7f 100644 --- a/vcf.c +++ b/vcf.c @@ -3564,7 +3564,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p v->unpacked |= BCF_IS_64BIT; bcf_enc_vlong(str, n_val, a_val, -1); val1 = a_val[0]; - if (n_val==1 && strcmp(key, "END") == 0)//memcmp instead of strcmp + if (n_val==1 && val1!=bcf_int64_missing && memcmp(key, "END", 4) == 0)//memcmp instead of strcmp { if ( val1 <= v->pos ) { @@ -3635,11 +3635,10 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) // almost certainly is never detected by the compiler so has no impact, // but equally so this code has minimal (often beneficial) impact on // performance too.) - s->s[s->l+0] = 0; - s->s[s->l+1] = 0; - s->s[s->l+2] = 0; - s->s[s->l+3] = 0; - */ // commented out the part that was required for optimization in vcf_parse_info function, will take a look later. + for(int i = 0; i < 4; i++) + s->s[s->l+i] = 0; + */ + // commented out the part that was required for optimization in vcf_parse_info function, will take a look later. bcf_clear1(v); str = &v->shared; memset(&aux, 0, sizeof(ks_tokaux_t)); @@ -4622,6 +4621,9 @@ bcf_hdr_t *bcf_hdr_read_required_sample_line(htsFile *hfp, const uint8_t is_samp if (bgzf_read(fp, buf, 4) != 4) goto fail; hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24); if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; } +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (hlen > FUZZ_ALLOC_LIMIT) { errno = ENOMEM; goto fail; } +#endif htxt = (char*)malloc(hlen + 1); if (!htxt) goto fail; if (bgzf_read(fp, htxt, hlen) != hlen) goto fail; From ba14d0d9f2c6c217481d44bcb72821684a254962 Mon Sep 17 00:00:00 2001 From: eneskuluk <54481799+eneskuluk@users.noreply.github.com> Date: Fri, 22 Sep 2023 14:05:20 -0700 Subject: [PATCH 6/7] revert precision change and minor fix in faidx.c --- faidx.c | 2 +- kstring.c | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/faidx.c b/faidx.c index 8ac149485..133a43177 100644 --- a/faidx.c +++ b/faidx.c @@ -950,7 +950,7 @@ static void fai_retrieve_into_buffer(const faidx_t *fai, const faidx1_t *val, } s[l] = '\0'; - *len = l < INT_MAX ? l : INT_MAX; + *len = l; } void faidx_fetch_seq_into_buffer(const faidx_t *fai, diff --git a/kstring.c b/kstring.c index 5284d53d0..f8e0f9f3d 100644 --- a/kstring.c +++ b/kstring.c @@ -57,7 +57,7 @@ int kputd(double d, kstring_t *s) { if (ks_resize(s, s->l + 50) < 0) return EOF; // We let stdio handle the exponent cases - int s2 = snprintf(s->s + s->l, s->m - s->l, "%#g", d); + int s2 = snprintf(s->s + s->l, s->m - s->l, "%g", d); len += s2; s->l += s2; return len; @@ -116,7 +116,7 @@ int kputd(double d, kstring_t *s) { } xp[0] = '.'; cp[7] = 0; ep=cp+6; - //if (cp[6] == '.') cp[6] = 0; + if (cp[6] == '.') cp[6] = 0; } // Cull trailing zeros @@ -125,10 +125,8 @@ int kputd(double d, kstring_t *s) { char *z = ep+1; while (ep > cp) { if (*ep == '.') { - if (z[-1] == '.'){ - z[0] = '0'; - z[1] = 0; - } + if (z[-1] == '.') + z[-1] = 0; else z[0] = 0; break; From 5b60e74b7529940a582838e53fcd00b1168e5e5e Mon Sep 17 00:00:00 2001 From: eneskuluk <54481799+eneskuluk@users.noreply.github.com> Date: Mon, 25 Sep 2023 16:11:10 -0700 Subject: [PATCH 7/7] address some of the comments --- faidx.c | 6 ++++++ htslib/vcf.h | 36 ++++++++++++++++++------------------ vcf.c | 20 +------------------- 3 files changed, 25 insertions(+), 37 deletions(-) diff --git a/faidx.c b/faidx.c index 133a43177..089001dd3 100644 --- a/faidx.c +++ b/faidx.c @@ -926,6 +926,12 @@ static void fai_retrieve_into_buffer(const faidx_t *fai, const faidx1_t *val, *len = -1; return; } + + if (val->line_blen <= 0) { + hts_log_error("Invalid line length in index: %d", val->line_blen); + *len = -1; + return; + } ret = bgzf_useek(fai->bgzf, offset diff --git a/htslib/vcf.h b/htslib/vcf.h index de830dda5..908f15b65 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -61,16 +61,16 @@ extern "C" { #define BCF_HT_FLAG 0 // header type #define BCF_HT_INT 1 -#define BCF_HT_REAL 7 -#define BCF_HT_STR 8 -#define BCF_HT_CHAR 9 -#define BCF_HT_INT64 10 +#define BCF_HT_REAL 2 +#define BCF_HT_STR 3 +#define BCF_HT_UINT 4 +#define BCF_HT_CHAR 5 +#define BCF_HT_INT64 6 #define BCF_HT_LONG BCF_HT_INT64 // BCF_HT_INT, but for int64_t values; VCF only! -#define BCF_HT_VOID 12 -#define BCF_NUM_HT_TYPES 14 -#define BCF_HT_UINT 2 -#define BCF_HT_UINT64 11 -#define BCF_HT_DOUBLE 13 +#define BCF_HT_UINT64 7 +#define BCF_HT_VOID 8 +#define BCF_HT_DOUBLE 9 +#define BCF_NUM_HT_TYPES 10 #define BCF_VL_FIXED 0 // variable length #define BCF_VL_VAR 1 @@ -154,14 +154,14 @@ extern uint8_t bcf_type_shift[]; #define VCF_SNP (1<<0) #define VCF_MNP (1<<1) #define VCF_INDEL (1<<2) -#define VCF_OTHER 32 -#define VCF_BND 64 // breakend -#define VCF_OVERLAP 16 // overlapping deletion, ALT=* -#define VCF_INS VCF_INDEL // implies VCF_INDEL -#define VCF_DEL VCF_INDEL // implies VCF_INDEL +#define VCF_OTHER (1<<3) +#define VCF_BND (1<<4) // breakend +#define VCF_OVERLAP (1<<5) // overlapping deletion, ALT=* +#define VCF_SPANNING_DELETION VCF_OVERLAP +#define VCF_INS (1<<6) // implies VCF_INDEL +#define VCF_DEL (1<<7) // implies VCF_INDEL #define VCF_ANY (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP|VCF_INS|VCF_DEL) // any variant type (but not VCF_REF) -#define VCF_NON_REF 8 -#define VCF_SPANNING_DELETION 16 +#define VCF_NON_REF (1<<8) typedef struct bcf_variant_t { int type, n; // variant type and the number of bases affected, negative for deletions @@ -1605,9 +1605,9 @@ static inline int bcf_enc_size(kstring_t *s, size_t size, int type) s->l += 6; } else{ - *p++ = 1<<4|BCF_BT_INT64;// + *p++ = 1<<4|BCF_BT_INT64; i64_to_le(size,p); - s->l += 10; // not so sure about +10 here, whether it is accurate or changes anything. + s->l += 10; return -1; } } diff --git a/vcf.c b/vcf.c index d23d84b7f..e6275ddad 100644 --- a/vcf.c +++ b/vcf.c @@ -1175,7 +1175,6 @@ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) { int len, done = 0; char *p = htxt; - int return_val = 0; // Check sanity: "fileformat" string must come as first bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len); @@ -1198,7 +1197,6 @@ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) { if(len < 0) { - return_val = -1; done = -1; break; } @@ -3577,7 +3575,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p else v->rlen = val1 - v->pos; } - } else if ((y >> 4 & 0xf) == BCF_HT_REAL) { + } else if ((y>>4&0xf) == BCF_HT_REAL) { float *val_f = (float *)a_val; for (i = 0, t = val; i < n_val; ++i, ++t) { @@ -3623,22 +3621,6 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) // Assumed in lots of places, but we may as well spot this early assert(sizeof(float) == sizeof(int32_t)); - // Ensure string we parse has space to permit some over-flow when during - // parsing. Eg to do memcmp(key, "END", 4) in vcf_parse_info over - // the more straight forward looking strcmp, giving a speed advantage. - /* - if (ks_resize(s, s->l+4) < 0) - return -1; - - // Force our memory to be initialised so we avoid the technicality of - // undefined behaviour in using a 4-byte memcmp. (The reality is this - // almost certainly is never detected by the compiler so has no impact, - // but equally so this code has minimal (often beneficial) impact on - // performance too.) - for(int i = 0; i < 4; i++) - s->s[s->l+i] = 0; - */ - // commented out the part that was required for optimization in vcf_parse_info function, will take a look later. bcf_clear1(v); str = &v->shared; memset(&aux, 0, sizeof(ks_tokaux_t));