@@ -128,6 +128,8 @@ typedef struct
128128 bcf_srs_t * files ;
129129 int gvcf_min , gvcf_break ; // min buffered gvcf END position (NB: gvcf_min is 1-based) or 0 if no active lines are present
130130 gvcf_aux_t * gvcf ; // buffer of gVCF lines
131+ int nout_smpl ;
132+ kstring_t * str ;
131133}
132134maux_t ;
133135
@@ -677,6 +679,8 @@ maux_t *maux_init(args_t *args)
677679 int i , n_smpl = 0 ;
678680 for (i = 0 ; i < ma -> n ; i ++ )
679681 n_smpl += bcf_hdr_nsamples (files -> readers [i ].header );
682+ ma -> nout_smpl = n_smpl ;
683+ assert ( n_smpl == bcf_hdr_nsamples (args -> out_hdr ) );
680684 if ( args -> do_gvcf )
681685 {
682686 ma -> gvcf = (gvcf_aux_t * ) calloc (ma -> n ,sizeof (gvcf_aux_t ));
@@ -688,11 +692,14 @@ maux_t *maux_init(args_t *args)
688692 ma -> buf = (buffer_t * ) calloc (ma -> n ,sizeof (buffer_t ));
689693 for (i = 0 ; i < ma -> n ; i ++ )
690694 ma -> buf [i ].rid = -1 ;
695+ ma -> str = (kstring_t * ) calloc (n_smpl ,sizeof (kstring_t ));
691696 return ma ;
692697}
693698void maux_destroy (maux_t * ma )
694699{
695700 int i ,j ;
701+ for (i = 0 ; i < ma -> nout_smpl ; i ++ ) free (ma -> str [i ].s );
702+ free (ma -> str );
696703 for (i = 0 ; i < ma -> mals ; i ++ )
697704 {
698705 free (ma -> als [i ]);
@@ -1008,7 +1015,7 @@ int copy_string_field(char *src, int isrc, int src_len, kstring_t *dst, int idst
10081015 int end_src = start_src ;
10091016 while ( end_src < src_len && src [end_src ] && src [end_src ]!= ',' ) end_src ++ ;
10101017
1011- int nsrc_cpy = end_src - start_src ;
1018+ int nsrc_cpy = end_src - start_src ; // number of chars to copy (excluding \0)
10121019 if ( nsrc_cpy == 1 && src [start_src ]== '.' ) return 0 ; // don't write missing values, dst is already initialized
10131020
10141021 int ith_dst = 0 , start_dst = 0 ;
@@ -1412,6 +1419,107 @@ void merge_GT(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
14121419 bcf_update_format_int32 (out_hdr , out , "GT" , (int32_t * )ma -> tmp_arr , nsamples * nsize );
14131420}
14141421
1422+ void merge_format_string (args_t * args , const char * key , bcf_fmt_t * * fmt_map , bcf1_t * out , int length , int nsize )
1423+ {
1424+ bcf_srs_t * files = args -> files ;
1425+ bcf_hdr_t * out_hdr = args -> out_hdr ;
1426+ maux_t * ma = args -> maux ;
1427+ int i ,j , nsamples = bcf_hdr_nsamples (out_hdr );
1428+
1429+ // initialize empty strings, a dot for each value, e.g. ".,.,."
1430+ int nmax = 0 ;
1431+ for (i = 0 ; i < nsamples ; i ++ )
1432+ {
1433+ kstring_t * str = & ma -> str [i ];
1434+ if ( length == BCF_VL_FIXED || length == BCF_VL_VAR )
1435+ {
1436+ str -> l = 1 ;
1437+ ks_resize (str , str -> l + 1 );
1438+ str -> s [0 ] = '.' ;
1439+ }
1440+ else
1441+ {
1442+ str -> l = nsize * 2 - 1 ;
1443+ ks_resize (str , str -> l + 1 );
1444+ str -> s [0 ] = '.' ;
1445+ for (j = 1 ; j < nsize ; j ++ ) str -> s [j * 2 - 1 ] = ',' , str -> s [j * 2 ] = '.' ;
1446+ }
1447+ str -> s [str -> l ] = 0 ;
1448+ if ( nmax < str -> l ) nmax = str -> l ;
1449+ }
1450+
1451+ // fill in values for each sample
1452+ int ismpl = 0 ;
1453+ for (i = 0 ; i < files -> nreaders ; i ++ )
1454+ {
1455+ bcf_sr_t * reader = & files -> readers [i ];
1456+ bcf_hdr_t * hdr = reader -> header ;
1457+ bcf_fmt_t * fmt_ori = fmt_map [i ];
1458+ if ( !fmt_ori )
1459+ {
1460+ // the field is not present in this file
1461+ ismpl += bcf_hdr_nsamples (hdr );
1462+ continue ;
1463+ }
1464+
1465+ bcf1_t * line = maux_get_line (args , i );
1466+ int irec = ma -> buf [i ].cur ;
1467+ char * src = (char * ) fmt_ori -> p ;
1468+
1469+ if ( length == BCF_VL_FIXED || length == BCF_VL_VAR || (line -> n_allele == out -> n_allele && !ma -> buf [i ].rec [irec ].als_differ ) )
1470+ {
1471+ // alleles unchanged, copy over
1472+ for (j = 0 ; j < bcf_hdr_nsamples (hdr ); j ++ )
1473+ {
1474+ kstring_t * str = & ma -> str [ismpl ++ ];
1475+ str -> l = 0 ;
1476+ kputs (src , str );
1477+ if ( nmax < str -> l ) nmax = str -> l ;
1478+ src += fmt_ori -> n ;
1479+ }
1480+ continue ;
1481+ }
1482+ // NB, what is below is not the fastest way, copy_string_field() keeps
1483+ // finding the indexes repeatedly at multiallelic sites
1484+ if ( length == BCF_VL_A || length == BCF_VL_R )
1485+ {
1486+ int ifrom = length == BCF_VL_A ? 1 : 0 ;
1487+ for (j = 0 ; j < bcf_hdr_nsamples (hdr ); j ++ )
1488+ {
1489+ kstring_t * str = & ma -> str [ismpl ++ ];
1490+ int iori ,inew ;
1491+ for (iori = ifrom ; iori < line -> n_allele ; iori ++ )
1492+ {
1493+ inew = ma -> buf [i ].rec [irec ].map [iori ] - ifrom ;
1494+ int ret = copy_string_field (src , iori - ifrom , fmt_ori -> size , str , inew );
1495+ if ( ret < -1 ) error ("[E::%s] fixme: internal error at %s:%d .. %d\n" ,__func__ ,bcf_seqname (hdr ,line ),line -> pos + 1 ,ret );
1496+ }
1497+ src += fmt_ori -> size ;
1498+ }
1499+ continue ;
1500+ }
1501+ assert ( length == BCF_VL_G );
1502+ error ("[E::%s] Merging of Number=G FORMAT strings (in your case FORMAT/%s) is not supported yet, sorry!\n"
1503+ "Please open an issue on github if this feature is essential for you. However, note that using FORMAT strings is not\n"
1504+ "a good idea in general - it is slow to parse and does not compress well, it is better to use integer codes instead.\n"
1505+ "If you don't really need it, use `bcftools annotate -x` to remove the annotation before merging.\n" , __func__ ,key );
1506+ }
1507+ // update the record
1508+ if ( ma -> ntmp_arr < nsamples * nmax )
1509+ {
1510+ ma -> ntmp_arr = nsamples * nmax ;
1511+ ma -> tmp_arr = realloc (ma -> tmp_arr , ma -> ntmp_arr );
1512+ }
1513+ char * tgt = (char * ) ma -> tmp_arr ;
1514+ for (i = 0 ; i < nsamples ; i ++ )
1515+ {
1516+ memcpy (tgt , ma -> str [i ].s , ma -> str [i ].l );
1517+ if ( ma -> str [i ].l < nmax ) memset (tgt + ma -> str [i ].l , 0 , nmax - ma -> str [i ].l );
1518+ tgt += nmax ;
1519+ }
1520+ bcf_update_format_char (out_hdr , out , key , (float * )ma -> tmp_arr , nsamples * nmax );
1521+ }
1522+
14151523void merge_format_field (args_t * args , bcf_fmt_t * * fmt_map , bcf1_t * out )
14161524{
14171525 bcf_srs_t * files = args -> files ;
@@ -1447,6 +1555,11 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
14471555 }
14481556 if ( fmt_map [i ]-> n > nsize ) nsize = fmt_map [i ]-> n ;
14491557 }
1558+ if ( type == BCF_BT_CHAR )
1559+ {
1560+ merge_format_string (args , key , fmt_map , out , length , nsize );
1561+ return ;
1562+ }
14501563
14511564 int msize = sizeof (float )> sizeof (int32_t ) ? sizeof (float ) : sizeof (int32_t );
14521565 if ( ma -> ntmp_arr < nsamples * nsize * msize )
@@ -1463,6 +1576,7 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
14631576 bcf_fmt_t * fmt_ori = fmt_map [i ];
14641577 bcf1_t * line = maux_get_line (args , i );
14651578 int irec = ma -> buf [i ].cur ;
1579+
14661580 if ( fmt_ori )
14671581 {
14681582 type = fmt_ori -> type ;
@@ -1619,15 +1733,12 @@ void merge_format_field(args_t *args, bcf_fmt_t **fmt_map, bcf1_t *out)
16191733 case BCF_BT_INT16 : BRANCH (int32_t , int16_t , * src == bcf_int16_missing , * src == bcf_int16_vector_end , * tgt = bcf_int32_missing , * tgt = bcf_int32_vector_end ); break ;
16201734 case BCF_BT_INT32 : BRANCH (int32_t , int32_t , * src == bcf_int32_missing , * src == bcf_int32_vector_end , * tgt = bcf_int32_missing , * tgt = bcf_int32_vector_end ); break ;
16211735 case BCF_BT_FLOAT : BRANCH (float , float , bcf_float_is_missing (* src ), bcf_float_is_vector_end (* src ), bcf_float_set_missing (* tgt ), bcf_float_set_vector_end (* tgt )); break ;
1622- case BCF_BT_CHAR : BRANCH (uint8_t , uint8_t , * src == bcf_str_missing , * src == bcf_str_vector_end , * tgt = bcf_str_missing , * tgt = bcf_str_vector_end ); break ;
16231736 default : error ("Unexpected case: %d, %s\n" , type , key );
16241737 }
16251738 #undef BRANCH
16261739 }
16271740 if ( type == BCF_BT_FLOAT )
16281741 bcf_update_format_float (out_hdr , out , key , (float * )ma -> tmp_arr , nsamples * nsize );
1629- else if ( type == BCF_BT_CHAR )
1630- bcf_update_format_char (out_hdr , out , key , (float * )ma -> tmp_arr , nsamples * nsize );
16311742 else
16321743 bcf_update_format_int32 (out_hdr , out , key , (int32_t * )ma -> tmp_arr , nsamples * nsize );
16331744}
0 commit comments