Skip to content

Commit 0668ecc

Browse files
committed
Add support for merging FORMAT strings, only Number=G tags not supported yet. Resolves samtools#815
1 parent 69711c9 commit 0668ecc

5 files changed

Lines changed: 146 additions & 4 deletions

File tree

test/merge.6.a.vcf

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
##fileformat=VCFv4.2
2+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
3+
##FORMAT=<ID=AFS,Number=A,Type=String,Description="Some string">
4+
##FORMAT=<ID=RFS,Number=R,Type=String,Description="Some string">
5+
##contig=<ID=1,length=248956422>
6+
##reference=file:///home/dnanexus/genome.fa
7+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT t1
8+
1 11080563 . A C . . . GT:RFS:AFS 0/1:a,c:c
9+
1 11080564 . A C . . . GT:RFS:AFS 0/1:a,c:c

test/merge.6.b.vcf

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
##fileformat=VCFv4.2
2+
##FORMAT=<ID=FFS,Number=2,Type=String,Description="Some string">
3+
##FORMAT=<ID=RFS,Number=R,Type=String,Description="Some string">
4+
##FORMAT=<ID=AFS,Number=A,Type=String,Description="Some string">
5+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
6+
##contig=<ID=1,length=248956422>
7+
##reference=file:///home/dnanexus/genome.fa
8+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT t2
9+
1 11080563 . A C . . . GT:FFS:AFS:RFS 0/1:xx,yy:C:A,C
10+
1 11080564 . A T . . . GT:FFS:AFS:RFS 0/1:xx,yy:T:A,T

test/merge.6.out

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
##fileformat=VCFv4.2
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
4+
##FORMAT=<ID=AFS,Number=A,Type=String,Description="Some string">
5+
##FORMAT=<ID=RFS,Number=R,Type=String,Description="Some string">
6+
##contig=<ID=1,length=248956422>
7+
##reference=file:///home/dnanexus/genome.fa
8+
##FORMAT=<ID=FFS,Number=2,Type=String,Description="Some string">
9+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT t1 t2
10+
1 11080563 . A C . . . GT:RFS:AFS:FFS 0/1:a,c:c:. 0/1:A,C:C:xx,yy
11+
1 11080564 . A C,T . . . GT:RFS:AFS:FFS 0/1:a,c,.:c,.:. 0/2:A,.,T:.,T:xx,yy

test/test.pl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@
6565
test_vcf_merge($opts,in=>['merge.gvcf.2.a','merge.gvcf.2.b','merge.gvcf.2.c'],out=>'merge.gvcf.2.out',args=>'--gvcf -');
6666
test_vcf_merge($opts,in=>['merge.gvcf.3.a','merge.gvcf.3.b'],out=>'merge.gvcf.3.out',args=>'--gvcf - -i SRC:join');
6767
test_vcf_merge($opts,in=>['merge.5.a','merge.5.b'],out=>'merge.5.out');
68+
test_vcf_merge($opts,in=>['merge.6.a','merge.6.b'],out=>'merge.6.out');
6869
test_vcf_query($opts,in=>'query',out=>'query.out',args=>q[-f '%CHROM\\t%POS\\t%REF\\t%ALT\\t%DP4\\t%AN[\\t%GT\\t%TGT]\\n']);
6970
test_vcf_query($opts,in=>'view.filter',out=>'query.2.out',args=>q[-f'%XRI\\n' -i'XRI[*]>1111']);
7071
test_vcf_query($opts,in=>'view.filter',out=>'query.3.out',args=>q[-f'%XRF\\n' -i'XRF[*]=2e6']);

vcfmerge.c

Lines changed: 115 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,8 @@ typedef struct
128128
bcf_srs_t *files;
129129
int gvcf_min, gvcf_break; // min buffered gvcf END position (NB: gvcf_min is 1-based) or 0 if no active lines are present
130130
gvcf_aux_t *gvcf; // buffer of gVCF lines
131+
int nout_smpl;
132+
kstring_t *str;
131133
}
132134
maux_t;
133135

@@ -677,6 +679,8 @@ maux_t *maux_init(args_t *args)
677679
int i, n_smpl = 0;
678680
for (i=0; i<ma->n; i++)
679681
n_smpl += bcf_hdr_nsamples(files->readers[i].header);
682+
ma->nout_smpl = n_smpl;
683+
assert( n_smpl==bcf_hdr_nsamples(args->out_hdr) );
680684
if ( args->do_gvcf )
681685
{
682686
ma->gvcf = (gvcf_aux_t*) calloc(ma->n,sizeof(gvcf_aux_t));
@@ -688,11 +692,14 @@ maux_t *maux_init(args_t *args)
688692
ma->buf = (buffer_t*) calloc(ma->n,sizeof(buffer_t));
689693
for (i=0; i<ma->n; i++)
690694
ma->buf[i].rid = -1;
695+
ma->str = (kstring_t*) calloc(n_smpl,sizeof(kstring_t));
691696
return ma;
692697
}
693698
void maux_destroy(maux_t *ma)
694699
{
695700
int i,j;
701+
for (i=0; i<ma->nout_smpl; i++) free(ma->str[i].s);
702+
free(ma->str);
696703
for (i=0; i<ma->mals; i++)
697704
{
698705
free(ma->als[i]);
@@ -1008,7 +1015,7 @@ int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst
10081015
int end_src = start_src;
10091016
while ( end_src<src_len && src[end_src] && src[end_src]!=',' ) end_src++;
10101017

1011-
int nsrc_cpy = end_src - start_src;
1018+
int nsrc_cpy = end_src - start_src; // number of chars to copy (excluding \0)
10121019
if ( nsrc_cpy==1 && src[start_src]=='.' ) return 0; // don't write missing values, dst is already initialized
10131020

10141021
int ith_dst = 0, start_dst = 0;
@@ -1412,6 +1419,107 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
14121419
bcf_update_format_int32(out_hdr, out, "GT", (int32_t*)ma->tmp_arr, nsamples*nsize);
14131420
}
14141421

1422+
void merge_format_string(args_t *args, const char *key, bcf_fmt_t **fmt_map, bcf1_t *out, int length, int nsize)
1423+
{
1424+
bcf_srs_t *files = args->files;
1425+
bcf_hdr_t *out_hdr = args->out_hdr;
1426+
maux_t *ma = args->maux;
1427+
int i,j, nsamples = bcf_hdr_nsamples(out_hdr);
1428+
1429+
// initialize empty strings, a dot for each value, e.g. ".,.,."
1430+
int nmax = 0;
1431+
for (i=0; i<nsamples; i++)
1432+
{
1433+
kstring_t *str = &ma->str[i];
1434+
if ( length==BCF_VL_FIXED || length==BCF_VL_VAR )
1435+
{
1436+
str->l = 1;
1437+
ks_resize(str, str->l+1);
1438+
str->s[0] = '.';
1439+
}
1440+
else
1441+
{
1442+
str->l = nsize*2 - 1;
1443+
ks_resize(str, str->l+1);
1444+
str->s[0] = '.';
1445+
for (j=1; j<nsize; j++) str->s[j*2-1] = ',', str->s[j*2] = '.';
1446+
}
1447+
str->s[str->l] = 0;
1448+
if ( nmax < str->l ) nmax = str->l;
1449+
}
1450+
1451+
// fill in values for each sample
1452+
int ismpl = 0;
1453+
for (i=0; i<files->nreaders; i++)
1454+
{
1455+
bcf_sr_t *reader = &files->readers[i];
1456+
bcf_hdr_t *hdr = reader->header;
1457+
bcf_fmt_t *fmt_ori = fmt_map[i];
1458+
if ( !fmt_ori )
1459+
{
1460+
// the field is not present in this file
1461+
ismpl += bcf_hdr_nsamples(hdr);
1462+
continue;
1463+
}
1464+
1465+
bcf1_t *line = maux_get_line(args, i);
1466+
int irec = ma->buf[i].cur;
1467+
char *src = (char*) fmt_ori->p;
1468+
1469+
if ( length==BCF_VL_FIXED || length==BCF_VL_VAR || (line->n_allele==out->n_allele && !ma->buf[i].rec[irec].als_differ) )
1470+
{
1471+
// alleles unchanged, copy over
1472+
for (j=0; j<bcf_hdr_nsamples(hdr); j++)
1473+
{
1474+
kstring_t *str = &ma->str[ismpl++];
1475+
str->l = 0;
1476+
kputs(src, str);
1477+
if ( nmax < str->l ) nmax = str->l;
1478+
src += fmt_ori->n;
1479+
}
1480+
continue;
1481+
}
1482+
// NB, what is below is not the fastest way, copy_string_field() keeps
1483+
// finding the indexes repeatedly at multiallelic sites
1484+
if ( length==BCF_VL_A || length==BCF_VL_R )
1485+
{
1486+
int ifrom = length==BCF_VL_A ? 1 : 0;
1487+
for (j=0; j<bcf_hdr_nsamples(hdr); j++)
1488+
{
1489+
kstring_t *str = &ma->str[ismpl++];
1490+
int iori,inew;
1491+
for (iori=ifrom; iori<line->n_allele; iori++)
1492+
{
1493+
inew = ma->buf[i].rec[irec].map[iori] - ifrom;
1494+
int ret = copy_string_field(src, iori - ifrom, fmt_ori->size, str, inew);
1495+
if ( ret<-1 ) error("[E::%s] fixme: internal error at %s:%d .. %d\n",__func__,bcf_seqname(hdr,line),line->pos+1,ret);
1496+
}
1497+
src += fmt_ori->size;
1498+
}
1499+
continue;
1500+
}
1501+
assert( length==BCF_VL_G );
1502+
error("[E::%s] Merging of Number=G FORMAT strings (in your case FORMAT/%s) is not supported yet, sorry!\n"
1503+
"Please open an issue on github if this feature is essential for you. However, note that using FORMAT strings is not\n"
1504+
"a good idea in general - it is slow to parse and does not compress well, it is better to use integer codes instead.\n"
1505+
"If you don't really need it, use `bcftools annotate -x` to remove the annotation before merging.\n", __func__,key);
1506+
}
1507+
// update the record
1508+
if ( ma->ntmp_arr < nsamples*nmax )
1509+
{
1510+
ma->ntmp_arr = nsamples*nmax;
1511+
ma->tmp_arr = realloc(ma->tmp_arr, ma->ntmp_arr);
1512+
}
1513+
char *tgt = (char*) ma->tmp_arr;
1514+
for (i=0; i<nsamples; i++)
1515+
{
1516+
memcpy(tgt, ma->str[i].s, ma->str[i].l);
1517+
if ( ma->str[i].l < nmax ) memset(tgt + ma->str[i].l, 0, nmax - ma->str[i].l);
1518+
tgt += nmax;
1519+
}
1520+
bcf_update_format_char(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nmax);
1521+
}
1522+
14151523
void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
14161524
{
14171525
bcf_srs_t *files = args->files;
@@ -1447,6 +1555,11 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
14471555
}
14481556
if ( fmt_map[i]->n > nsize ) nsize = fmt_map[i]->n;
14491557
}
1558+
if ( type==BCF_BT_CHAR )
1559+
{
1560+
merge_format_string(args, key, fmt_map, out, length, nsize);
1561+
return;
1562+
}
14501563

14511564
int msize = sizeof(float)>sizeof(int32_t) ? sizeof(float) : sizeof(int32_t);
14521565
if ( ma->ntmp_arr < nsamples*nsize*msize )
@@ -1463,6 +1576,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
14631576
bcf_fmt_t *fmt_ori = fmt_map[i];
14641577
bcf1_t *line = maux_get_line(args, i);
14651578
int irec = ma->buf[i].cur;
1579+
14661580
if ( fmt_ori )
14671581
{
14681582
type = fmt_ori->type;
@@ -1619,15 +1733,12 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
16191733
case BCF_BT_INT16: BRANCH(int32_t, int16_t, *src==bcf_int16_missing, *src==bcf_int16_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
16201734
case BCF_BT_INT32: BRANCH(int32_t, int32_t, *src==bcf_int32_missing, *src==bcf_int32_vector_end, *tgt=bcf_int32_missing, *tgt=bcf_int32_vector_end); break;
16211735
case BCF_BT_FLOAT: BRANCH(float, float, bcf_float_is_missing(*src), bcf_float_is_vector_end(*src), bcf_float_set_missing(*tgt), bcf_float_set_vector_end(*tgt)); break;
1622-
case BCF_BT_CHAR: BRANCH(uint8_t, uint8_t, *src==bcf_str_missing, *src==bcf_str_vector_end, *tgt=bcf_str_missing, *tgt=bcf_str_vector_end); break;
16231736
default: error("Unexpected case: %d, %s\n", type, key);
16241737
}
16251738
#undef BRANCH
16261739
}
16271740
if ( type==BCF_BT_FLOAT )
16281741
bcf_update_format_float(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nsize);
1629-
else if ( type==BCF_BT_CHAR )
1630-
bcf_update_format_char(out_hdr, out, key, (float*)ma->tmp_arr, nsamples*nsize);
16311742
else
16321743
bcf_update_format_int32(out_hdr, out, key, (int32_t*)ma->tmp_arr, nsamples*nsize);
16331744
}

0 commit comments

Comments
 (0)