Skip to content

Commit 3fba279

Browse files
committed
Add VCF/BCF support for POS=0 coordinate
Review of `bcftools merge` code and possibly others is necessary because it is relying on uninitialized POS==-1 in synced_bcf_reader This should resolve samtools#1571
1 parent 70df047 commit 3fba279

File tree

5 files changed

+29
-24
lines changed

5 files changed

+29
-24
lines changed

hts.c

+6-4
Original file line numberDiff line numberDiff line change
@@ -2205,6 +2205,8 @@ static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t of
22052205
{
22062206
int i;
22072207
hts_pos_t beg, end;
2208+
if ( _beg<0 ) _beg = 0;
2209+
if ( _end<=_beg ) _end = _beg+1;
22082210
beg = _beg >> min_shift;
22092211
end = (_end - 1) >> min_shift;
22102212
if (l->m < end + 1) {
@@ -2905,6 +2907,8 @@ static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shi
29052907
{
29062908
int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
29072909
if (beg >= end) return 0;
2910+
if (beg < 0 ) beg = 0;
2911+
if (beg==end) end = 1;
29082912
if (end >= 1LL<<s) end = 1LL<<s;
29092913
for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
29102914
hts_pos_t b, e;
@@ -3086,7 +3090,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
30863090
} else if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) {
30873091
iter->finished = 1;
30883092
} else {
3089-
if (beg < 0) beg = 0;
3093+
if (beg < -1) beg = -1;
30903094
if (end < beg) {
30913095
free(iter);
30923096
return NULL;
@@ -3103,7 +3107,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
31033107

31043108
if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }
31053109

3106-
rel_off = beg>>idx->min_shift;
3110+
rel_off = (beg>=0 ? beg : 0) >> idx->min_shift;
31073111
// compute min_off
31083112
bin = hts_bin_first(idx->n_lvls) + rel_off;
31093113
do {
@@ -3138,8 +3142,6 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
31383142
min_off = kh_val(bidx, k).loff;
31393143
}
31403144
} else if (unmapped) { //CSI index
3141-
if (k != kh_end(bidx))
3142-
min_off = kh_val(bidx, k).loff;
31433145
}
31443146

31453147
// compute max_off: a virtual offset from a bin to the right of end

htslib/synced_bcf_reader.h

+2
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ DEALINGS IN THE SOFTWARE. */
6464
extern "C" {
6565
#endif
6666

67+
#define CSI_COOR_EMPTY -2
68+
6769
/*
6870
When reading multiple files in parallel, duplicate records within each
6971
file will be reordered and offered in intuitive order. For example,

synced_bcf_reader.c

+16-15
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,7 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
371371
if ( !files->regions )
372372
files->regions = _regions_init_string(names[i]);
373373
else
374-
_regions_add(files->regions, names[i], -1, -1);
374+
_regions_add(files->regions, names[i], CSI_COOR_EMPTY, CSI_COOR_EMPTY);
375375
}
376376
free(names);
377377
_regions_sort_and_merge(files->regions);
@@ -555,7 +555,7 @@ static int _readers_next_region(bcf_srs_t *files)
555555
int prev_iseq = files->regions->iseq;
556556
hts_pos_t prev_end = files->regions->end;
557557
if ( bcf_sr_regions_next(files->regions)<0 ) return -1;
558-
files->regions->prev_end = prev_iseq==files->regions->iseq ? prev_end : -1;
558+
files->regions->prev_end = prev_iseq==files->regions->iseq ? prev_end : CSI_COOR_EMPTY;
559559

560560
for (i=0; i<files->nreaders; i++)
561561
_reader_seek(&files->readers[i],files->regions->seq_names[files->regions->iseq],files->regions->start,files->regions->end);
@@ -616,7 +616,7 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
616616
{
617617
reader->buffer[reader->mbuffer-i] = bcf_init1();
618618
reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack;
619-
reader->buffer[reader->mbuffer-i]->pos = -1; // for rare cases when VCF starts from 1
619+
reader->buffer[reader->mbuffer-i]->pos = CSI_COOR_EMPTY; // for rare cases when VCF starts from 1 or 0
620620
}
621621
}
622622
if ( files->streaming )
@@ -656,7 +656,6 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
656656
if ( ret < 0 ) break; // no more lines or an error
657657
bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]);
658658
}
659-
660659
// Prevent creation of duplicates from records overlapping multiple regions
661660
// and recognize true variant overlaps vs record overlaps (e.g. TA>T vs A>-)
662661
if ( files->regions )
@@ -676,7 +675,6 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
676675
hts_log_error("This should never happen, just to keep clang compiler happy: %d",BCF_SR_AUX(files)->targets_overlap);
677676
exit(1);
678677
}
679-
680678
if ( beg <= files->regions->prev_end || end < files->regions->start || beg > files->regions->end ) continue;
681679
}
682680

@@ -954,9 +952,9 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file)
954952
// inclusive. Sorting and merging step needed afterwards: qsort(..,cmp_regions) and merge_regions().
955953
static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end)
956954
{
957-
if ( start==-1 && end==-1 )
955+
if ( start==CSI_COOR_EMPTY && end==CSI_COOR_EMPTY )
958956
{
959-
start = 0; end = MAX_CSI_COOR-1;
957+
start = -1; end = MAX_CSI_COOR-1;
960958
}
961959
else
962960
{
@@ -1029,8 +1027,9 @@ void _regions_sort_and_merge(bcf_sr_regions_t *reg)
10291027
static bcf_sr_regions_t *_regions_init_string(const char *str)
10301028
{
10311029
bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
1032-
reg->start = reg->end = -1;
1033-
reg->prev_start = reg->prev_end = reg->prev_seq = -1;
1030+
reg->start = reg->end = CSI_COOR_EMPTY;
1031+
reg->prev_start = reg->prev_end = CSI_COOR_EMPTY;
1032+
reg->prev_seq = -1;
10341033

10351034
kstring_t tmp = {0,0,0};
10361035
const char *sp = str, *ep = str;
@@ -1075,7 +1074,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str)
10751074
}
10761075
else
10771076
{
1078-
if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1);
1077+
if ( tmp.l ) _regions_add(reg, tmp.s, CSI_COOR_EMPTY, CSI_COOR_EMPTY);
10791078
if ( !*ep ) break;
10801079
sp = ++ep;
10811080
}
@@ -1157,8 +1156,9 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr
11571156
}
11581157

11591158
reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
1160-
reg->start = reg->end = -1;
1161-
reg->prev_start = reg->prev_end = reg->prev_seq = -1;
1159+
reg->start = reg->end = CSI_COOR_EMPTY;
1160+
reg->prev_start = reg->prev_end = CSI_COOR_EMPTY;
1161+
reg->prev_seq = -1;
11621162

11631163
reg->file = hts_open(regions, "rb");
11641164
if ( !reg->file )
@@ -1253,7 +1253,8 @@ void bcf_sr_regions_destroy(bcf_sr_regions_t *reg)
12531253

12541254
int bcf_sr_regions_seek(bcf_sr_regions_t *reg, const char *seq)
12551255
{
1256-
reg->iseq = reg->start = reg->end = -1;
1256+
reg->iseq = -1;
1257+
reg->start = reg->end = CSI_COOR_EMPTY;
12571258
if ( khash_str2int_get(reg->seq_hash, seq, &reg->iseq) < 0 ) return -1; // sequence seq not in regions
12581259

12591260
// using in-memory regions
@@ -1284,7 +1285,7 @@ static int advance_creg(region_t *reg)
12841285
int bcf_sr_regions_next(bcf_sr_regions_t *reg)
12851286
{
12861287
if ( reg->iseq<0 ) return -1;
1287-
reg->start = reg->end = -1;
1288+
reg->start = reg->end = CSI_COOR_EMPTY;
12881289
reg->nals = 0;
12891290

12901291
// using in-memory regions
@@ -1442,7 +1443,7 @@ static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_p
14421443
bcf_sr_regions_flush(reg);
14431444

14441445
bcf_sr_regions_seek(reg, seq);
1445-
reg->start = reg->end = -1;
1446+
reg->start = reg->end = CSI_COOR_EMPTY;
14461447
}
14471448
if ( reg->prev_seq==iseq && reg->iseq!=iseq ) return -2; // no more regions on this chromosome
14481449
reg->prev_seq = reg->iseq;

tbx.c

+4-4
Original file line numberDiff line numberDiff line change
@@ -107,12 +107,12 @@ int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv)
107107
if ( s==line+b ) return -1; // expected int
108108
if (!(conf->preset&TBX_UCSC)) --intv->beg;
109109
else ++intv->end;
110-
if (intv->beg < 0) {
110+
if (intv->beg < -1) {
111111
hts_log_warning("Coordinate <= 0 detected. "
112112
"Did you forget to use the -0 option?");
113113
intv->beg = 0;
114114
}
115-
if (intv->end < 1) intv->end = 1;
115+
if (intv->end < intv->beg ) intv->end = intv->beg;
116116
} else {
117117
if ((conf->preset&0xffff) == TBX_GENERIC) {
118118
if (id == conf->ec)
@@ -171,7 +171,7 @@ int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv)
171171
++id;
172172
}
173173
}
174-
if (intv->ss == 0 || intv->se == 0 || intv->beg < 0 || intv->end < 0) return -1;
174+
if (intv->ss == 0 || intv->se == 0 || intv->beg < -1 || intv->end < -1) return -1;
175175
return 0;
176176
}
177177

@@ -181,7 +181,7 @@ static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_
181181
int c = *intv->se;
182182
*intv->se = '\0'; intv->tid = get_tid(tbx, intv->ss, is_add); *intv->se = c;
183183
if (intv->tid < 0) return -2; // get_tid out of memory
184-
return (intv->beg >= 0 && intv->end >= 0)? 0 : -1;
184+
return (intv->beg >= -1 && intv->end >= -1)? 0 : -1;
185185
} else {
186186
char *type = NULL;
187187
switch (tbx->conf.preset&0xffff)

0 commit comments

Comments
 (0)