Skip to content

Commit 306664a

Browse files
committed
Allow initial whitespace in FASTA ">" headers
Fixes samtools/samtools#449. Also ensure an empty name works; fixes #258, hat tip @mtmorgan. Add test/faidx.fa test cases, with unnamed sequence, extra whitespace, and tests for previously-fixed blank line-related bugs fixed in 1980e58. Fix memory leaks introduced by 642783e. [NEWS] * fai_build() and samtools faidx now accept initial whitespace in ">" headers (e.g., "> chr1 description" is taken to refer to "chr1")
1 parent 82203a2 commit 306664a

File tree

5 files changed

+77
-42
lines changed

5 files changed

+77
-42
lines changed

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ hts.o hts.pico: hts.c config.h version.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram
242242
vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) $(htslib_khash_str2int_h)
243243
sam.o sam.pico: sam.c config.h $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h)
244244
tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(htslib_khash_h)
245-
faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_khash_h)
245+
faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kstring_h)
246246
synced_bcf_reader.o synced_bcf_reader.pico: synced_bcf_reader.c config.h $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) $(htslib_bgzf_h)
247247
vcf_sweep.o vcf_sweep.pico: vcf_sweep.c config.h $(htslib_vcf_sweep_h) $(htslib_bgzf_h)
248248
vcfutils.o vcfutils.pico: vcfutils.c config.h $(htslib_vcfutils_h)
@@ -289,7 +289,7 @@ tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htsl
289289
check test: $(BUILT_TEST_PROGRAMS)
290290
test/fieldarith test/fieldarith.sam
291291
test/hfile
292-
test/sam test/ce.fa
292+
test/sam test/ce.fa test/faidx.fa
293293
test/test-regidx
294294
cd test && REF_PATH=: ./test_view.pl
295295
cd test && ./test.pl

faidx.5

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
'\" t
2-
.TH faidx 5 "August 2013" "htslib" "Bioinformatics formats"
2+
.TH faidx 5 "August 2015" "htslib" "Bioinformatics formats"
33
.SH NAME
44
faidx \- an index enabling random access to FASTA files
55
.\"
6-
.\" Copyright (C) 2013 Genome Research Ltd.
6+
.\" Copyright (C) 2013, 2015 Genome Research Ltd.
77
.\"
88
.\" Author: John Marshall <jm18@sanger.ac.uk>
99
.\"
@@ -98,9 +98,8 @@ or other line termination, the newline characters present must be consistent,
9898
at least within each reference sequence.
9999
.P
100100
The \fBsamtools\fP implementation uses the first word of the "\fB>\fP" header
101-
line text (i.e., up to the first whitespace character) as the \fBNAME\fP column.
102-
At present, there may be no whitespace between the
103-
">" character and the \fIname\fP.
101+
line text (i.e., up to the first whitespace character, having skipped any
102+
initial whitespace after the ">") as the \fBNAME\fP column.
104103
.SH EXAMPLE
105104
For example, given this FASTA file
106105
.LP

faidx.c

Lines changed: 27 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ DEALINGS IN THE SOFTWARE. */
3535
#include "htslib/faidx.h"
3636
#include "htslib/hfile.h"
3737
#include "htslib/khash.h"
38+
#include "htslib/kstring.h"
3839

3940
typedef struct {
4041
int32_t line_len, line_blen;
@@ -80,9 +81,8 @@ static inline void fai_insert_index(faidx_t *idx, const char *name, int len, int
8081

8182
faidx_t *fai_build_core(BGZF *bgzf)
8283
{
83-
char *name;
84+
kstring_t name = { 0, 0, NULL };
8485
int c;
85-
int l_name, m_name;
8686
int line_len, line_blen, state;
8787
int l1, l2;
8888
faidx_t *idx;
@@ -91,7 +91,6 @@ faidx_t *fai_build_core(BGZF *bgzf)
9191

9292
idx = (faidx_t*)calloc(1, sizeof(faidx_t));
9393
idx->hash = kh_init(s);
94-
name = 0; l_name = m_name = 0;
9594
len = line_len = line_blen = -1; state = 0; l1 = l2 = -1; offset = 0;
9695
while ( (c=bgzf_getc(bgzf))>=0 ) {
9796
if (c == '\n') { // an empty line
@@ -103,30 +102,25 @@ faidx_t *fai_build_core(BGZF *bgzf)
103102
}
104103
if (c == '>') { // fasta header
105104
if (len >= 0)
106-
fai_insert_index(idx, name, len, line_len, line_blen, offset);
107-
l_name = 0;
108-
while ( (c=bgzf_getc(bgzf))>=0 && !isspace(c)) {
109-
if (m_name < l_name + 2) {
110-
m_name = l_name + 2;
111-
kroundup32(m_name);
112-
name = (char*)realloc(name, m_name);
113-
}
114-
name[l_name++] = c;
115-
}
116-
name[l_name] = '\0';
105+
fai_insert_index(idx, name.s, len, line_len, line_blen, offset);
106+
107+
name.l = 0;
108+
while ((c = bgzf_getc(bgzf)) >= 0)
109+
if (! isspace(c)) kputc_(c, &name);
110+
else if (name.l > 0 || c == '\n') break;
111+
kputsn("", 0, &name);
112+
117113
if ( c<0 ) {
118114
fprintf(stderr, "[fai_build_core] the last entry has no sequence\n");
119-
free(name); fai_destroy(idx);
120-
return 0;
115+
goto fail;
121116
}
122117
if (c != '\n') while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
123118
state = 1; len = 0;
124119
offset = bgzf_utell(bgzf);
125120
} else {
126121
if (state == 3) {
127-
fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name);
128-
free(name); fai_destroy(idx);
129-
return 0;
122+
fprintf(stderr, "[fai_build_core] inlined empty line is not allowed in sequence '%s'.\n", name.s);
123+
goto fail;
130124
}
131125
if (state == 2) state = 3;
132126
l1 = l2 = 0;
@@ -135,9 +129,8 @@ faidx_t *fai_build_core(BGZF *bgzf)
135129
if (isgraph(c)) ++l2;
136130
} while ( (c=bgzf_getc(bgzf))>=0 && c != '\n');
137131
if (state == 3 && l2) {
138-
fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name);
139-
free(name); fai_destroy(idx);
140-
return 0;
132+
fprintf(stderr, "[fai_build_core] different line length in sequence '%s'.\n", name.s);
133+
goto fail;
141134
}
142135
++l1; len += l2;
143136
if (state == 1) line_len = l1, line_blen = l2, state = 0;
@@ -146,15 +139,19 @@ faidx_t *fai_build_core(BGZF *bgzf)
146139
}
147140
}
148141
}
149-
if ( name )
150-
fai_insert_index(idx, name, len, line_len, line_blen, offset);
142+
143+
if (len >= 0)
144+
fai_insert_index(idx, name.s, len, line_len, line_blen, offset);
151145
else
152-
{
153-
free(idx);
154-
return NULL;
155-
}
156-
free(name);
146+
goto fail;
147+
148+
free(name.s);
157149
return idx;
150+
151+
fail:
152+
free(name.s);
153+
fai_destroy(idx);
154+
return NULL;
158155
}
159156

160157
void fai_save(const faidx_t *fai, FILE *fp)
@@ -229,6 +226,7 @@ int fai_build(const char *fn)
229226
if ( !fai )
230227
{
231228
if ( bgzf->is_compressed && bgzf->is_gzip ) fprintf(stderr,"Cannot index files compressed with gzip, please use bgzip\n");
229+
bgzf_close(bgzf);
232230
free(str);
233231
return -1;
234232
}

test/faidx.fa

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
>
2+
ATGC
3+
>trailingblank1
4+
AAATTTGGGCCC
5+
TTTGGGCCCAAA
6+
GGGCCCAAA
7+
8+
>trailingblank2 with last dna line the same length
9+
AAATTTGGGCCCAAATTTGGGCCC
10+
TTTGGGCCCAAATTTGGGCCCAAA
11+
GGGCCCAAATTTGGGCCCAAATTT
12+
13+
> foo
14+
TGCATG
15+
CA
16+
> bar description
17+
TTTTAAAA

test/sam.c

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -162,26 +162,47 @@ static void iterators1(void)
162162

163163
static void faidx1(const char *filename)
164164
{
165-
int n;
166-
faidx_t *fai = fai_load(filename);
167-
if (fai == NULL) fail("can't load faidx file");
165+
int n, n_exp = 0;
166+
char tmpfilename[FILENAME_MAX], line[500];
167+
FILE *fin, *fout;
168+
faidx_t *fai;
169+
170+
fin = fopen(filename, "r");
171+
if (fin == NULL) fail("can't open %s\n", filename);
172+
sprintf(tmpfilename, "%s.tmp", filename);
173+
fout = fopen(tmpfilename, "w");
174+
if (fout == NULL) fail("can't create temporary %s\n", tmpfilename);
175+
while (fgets(line, sizeof line, fin)) {
176+
if (line[0] == '>') n_exp++;
177+
fputs(line, fout);
178+
}
179+
fclose(fin);
180+
fclose(fout);
181+
182+
if (fai_build(tmpfilename) < 0) fail("can't index %s", tmpfilename);
183+
fai = fai_load(tmpfilename);
184+
if (fai == NULL) fail("can't load faidx file %s", tmpfilename);
168185

169186
n = faidx_fetch_nseq(fai);
170-
if (n != 7) fail("faidx_fetch_nseq returned %d, expected 7", n);
187+
if (n != n_exp)
188+
fail("%s: faidx_fetch_nseq returned %d, expected %d", filename, n, n_exp);
171189

172190
n = faidx_nseq(fai);
173-
if (n != 7) fail("faidx_nseq returned %d, expected 7", n);
191+
if (n != n_exp)
192+
fail("%s: faidx_nseq returned %d, expected %d", filename, n, n_exp);
174193

175194
fai_destroy(fai);
176195
}
177196

178197
int main(int argc, char **argv)
179198
{
199+
int i;
200+
180201
status = EXIT_SUCCESS;
181202

182203
aux_fields1();
183204
iterators1();
184-
if (argc >= 2) faidx1(argv[1]);
205+
for (i = 1; i < argc; i++) faidx1(argv[i]);
185206

186207
return status;
187208
}

0 commit comments

Comments
 (0)