Split VCF into n pieces by coordinate

Introduction

bcftools view -r 1:40000-50000 vcf.gz will output (to stdout) a vcf containing the header and variants on chromosome 1 between coordinates 40,000 and 50,000 base pairs.

I need to break down a large vcf into smaller pieces to dramatically speed up annotation. Let’s try 100 pieces.

The human genome is approximately 3 gigabases or 3e9 base pairs.

\[ \frac{3 * 10^9\ base\ pairs}{100\ pieces} = 3*10^7\ base\ pairs\ per\ piece \]

That’s our target size.

This is made a bit tricky since the genome is laid by chromosome. So we have to break into 3e7 pieces, accounting for chromosomes. There are also many contigs, most of which are well under 3e7 in size. Those can be processed as a group with bcftools by splitting each contig by a ,.

Let’s read in the header. It contains chromosome (and contig) sizes, which I’ve extracted from the vcf with zcat EGAD00001002656.GATK.vcf.gz | head -n 1000 | grep ^## > /home/mcgaugheyd/git/OGVFB_one_offs/mcgaughey/split_VCFs_into_n_pieces/EGAD00001002656.header

Read in vcf header

library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(stringr)
vcf_header = scan('~/git/OGVFB_one_offs/mcgaughey/split_VCFs_into_n_pieces/EGAD00001002656.header', what='character')
vcf_header[grepl('contig',vcf_header)]
##  [1] "##contig=<ID=1,length=249250621,assembly=b37>"      
##  [2] "##contig=<ID=2,length=243199373,assembly=b37>"      
##  [3] "##contig=<ID=3,length=198022430,assembly=b37>"      
##  [4] "##contig=<ID=4,length=191154276,assembly=b37>"      
##  [5] "##contig=<ID=5,length=180915260,assembly=b37>"      
##  [6] "##contig=<ID=6,length=171115067,assembly=b37>"      
##  [7] "##contig=<ID=7,length=159138663,assembly=b37>"      
##  [8] "##contig=<ID=8,length=146364022,assembly=b37>"      
##  [9] "##contig=<ID=9,length=141213431,assembly=b37>"      
## [10] "##contig=<ID=10,length=135534747,assembly=b37>"     
## [11] "##contig=<ID=11,length=135006516,assembly=b37>"     
## [12] "##contig=<ID=12,length=133851895,assembly=b37>"     
## [13] "##contig=<ID=13,length=115169878,assembly=b37>"     
## [14] "##contig=<ID=14,length=107349540,assembly=b37>"     
## [15] "##contig=<ID=15,length=102531392,assembly=b37>"     
## [16] "##contig=<ID=16,length=90354753,assembly=b37>"      
## [17] "##contig=<ID=17,length=81195210,assembly=b37>"      
## [18] "##contig=<ID=18,length=78077248,assembly=b37>"      
## [19] "##contig=<ID=19,length=59128983,assembly=b37>"      
## [20] "##contig=<ID=20,length=63025520,assembly=b37>"      
## [21] "##contig=<ID=21,length=48129895,assembly=b37>"      
## [22] "##contig=<ID=22,length=51304566,assembly=b37>"      
## [23] "##contig=<ID=X,length=155270560,assembly=b37>"      
## [24] "##contig=<ID=Y,length=59373566,assembly=b37>"       
## [25] "##contig=<ID=MT,length=16569,assembly=b37>"         
## [26] "##contig=<ID=GL000207.1,length=4262,assembly=b37>"  
## [27] "##contig=<ID=GL000226.1,length=15008,assembly=b37>" 
## [28] "##contig=<ID=GL000229.1,length=19913,assembly=b37>" 
## [29] "##contig=<ID=GL000231.1,length=27386,assembly=b37>" 
## [30] "##contig=<ID=GL000210.1,length=27682,assembly=b37>" 
## [31] "##contig=<ID=GL000239.1,length=33824,assembly=b37>" 
## [32] "##contig=<ID=GL000235.1,length=34474,assembly=b37>" 
## [33] "##contig=<ID=GL000201.1,length=36148,assembly=b37>" 
## [34] "##contig=<ID=GL000247.1,length=36422,assembly=b37>" 
## [35] "##contig=<ID=GL000245.1,length=36651,assembly=b37>" 
## [36] "##contig=<ID=GL000197.1,length=37175,assembly=b37>" 
## [37] "##contig=<ID=GL000203.1,length=37498,assembly=b37>" 
## [38] "##contig=<ID=GL000246.1,length=38154,assembly=b37>" 
## [39] "##contig=<ID=GL000249.1,length=38502,assembly=b37>" 
## [40] "##contig=<ID=GL000196.1,length=38914,assembly=b37>" 
## [41] "##contig=<ID=GL000248.1,length=39786,assembly=b37>" 
## [42] "##contig=<ID=GL000244.1,length=39929,assembly=b37>" 
## [43] "##contig=<ID=GL000238.1,length=39939,assembly=b37>" 
## [44] "##contig=<ID=GL000202.1,length=40103,assembly=b37>" 
## [45] "##contig=<ID=GL000234.1,length=40531,assembly=b37>" 
## [46] "##contig=<ID=GL000232.1,length=40652,assembly=b37>" 
## [47] "##contig=<ID=GL000206.1,length=41001,assembly=b37>" 
## [48] "##contig=<ID=GL000240.1,length=41933,assembly=b37>" 
## [49] "##contig=<ID=GL000236.1,length=41934,assembly=b37>" 
## [50] "##contig=<ID=GL000241.1,length=42152,assembly=b37>" 
## [51] "##contig=<ID=GL000243.1,length=43341,assembly=b37>" 
## [52] "##contig=<ID=GL000242.1,length=43523,assembly=b37>" 
## [53] "##contig=<ID=GL000230.1,length=43691,assembly=b37>" 
## [54] "##contig=<ID=GL000237.1,length=45867,assembly=b37>" 
## [55] "##contig=<ID=GL000233.1,length=45941,assembly=b37>" 
## [56] "##contig=<ID=GL000204.1,length=81310,assembly=b37>" 
## [57] "##contig=<ID=GL000198.1,length=90085,assembly=b37>" 
## [58] "##contig=<ID=GL000208.1,length=92689,assembly=b37>" 
## [59] "##contig=<ID=GL000191.1,length=106433,assembly=b37>"
## [60] "##contig=<ID=GL000227.1,length=128374,assembly=b37>"
## [61] "##contig=<ID=GL000228.1,length=129120,assembly=b37>"
## [62] "##contig=<ID=GL000214.1,length=137718,assembly=b37>"
## [63] "##contig=<ID=GL000221.1,length=155397,assembly=b37>"
## [64] "##contig=<ID=GL000209.1,length=159169,assembly=b37>"
## [65] "##contig=<ID=GL000218.1,length=161147,assembly=b37>"
## [66] "##contig=<ID=GL000220.1,length=161802,assembly=b37>"
## [67] "##contig=<ID=GL000213.1,length=164239,assembly=b37>"
## [68] "##contig=<ID=GL000211.1,length=166566,assembly=b37>"
## [69] "##contig=<ID=GL000199.1,length=169874,assembly=b37>"
## [70] "##contig=<ID=GL000217.1,length=172149,assembly=b37>"
## [71] "##contig=<ID=GL000216.1,length=172294,assembly=b37>"
## [72] "##contig=<ID=GL000215.1,length=172545,assembly=b37>"
## [73] "##contig=<ID=GL000205.1,length=174588,assembly=b37>"
## [74] "##contig=<ID=GL000219.1,length=179198,assembly=b37>"
## [75] "##contig=<ID=GL000224.1,length=179693,assembly=b37>"
## [76] "##contig=<ID=GL000223.1,length=180455,assembly=b37>"
## [77] "##contig=<ID=GL000195.1,length=182896,assembly=b37>"
## [78] "##contig=<ID=GL000212.1,length=186858,assembly=b37>"
## [79] "##contig=<ID=GL000222.1,length=186861,assembly=b37>"
## [80] "##contig=<ID=GL000200.1,length=187035,assembly=b37>"
## [81] "##contig=<ID=GL000193.1,length=189789,assembly=b37>"
## [82] "##contig=<ID=GL000194.1,length=191469,assembly=b37>"
## [83] "##contig=<ID=GL000225.1,length=211173,assembly=b37>"
## [84] "##contig=<ID=GL000192.1,length=547496,assembly=b37>"
## [85] "##contig=<ID=NC_007605,length=171823,assembly=b37>" 
## [86] "##contig=<ID=hs37d5,length=35477943,assembly=b37>"

Parse out chr / contig sizes

# turn into data frame (well, a tibble)
contig_size <- vcf_header[grepl('contig', vcf_header)] %>% 
  data.frame() %>% 
  select(1, 'header' = 1) %>% 
  # separate by ,
  separate(header, c('contig','length','assembly'),',') %>% 
  # extract values by splitting against = and taking the last element (first after reversing)
  rowwise() %>% 
  mutate(contig = str_split(contig,'=')[[1]] %>% gsub('>','',.) %>% rev() %>% .[[1]],
         length = str_split(length,'=')[[1]] %>% gsub('>','',.) %>% rev() %>% .[[1]] %>% as.numeric(),
         assembly = str_split(assembly,'=')[[1]] %>% gsub('>','',.) %>% rev() %>% .[[1]])
contig_size
## Source: local data frame [86 x 3]
## Groups: <by row>
## 
## # A tibble: 86 x 3
##    contig    length assembly
##    <chr>      <dbl> <chr>   
##  1 1      249250621 b37     
##  2 2      243199373 b37     
##  3 3      198022430 b37     
##  4 4      191154276 b37     
##  5 5      180915260 b37     
##  6 6      171115067 b37     
##  7 7      159138663 b37     
##  8 8      146364022 b37     
##  9 9      141213431 b37     
## 10 10     135534747 b37     
## # ... with 76 more rows

Split chr above 3e7 base pairs into equal(ish) size pieces

ceiling will allow intervals a bit less than 3e7 by rounding up the number of pieces per chromsome. Would rather have more splits with less than the target size.

n_split <- function(size){
  pieces <- ceiling(size / 3e7)
  seq(1, size, size/pieces)
}

calculate coordinates

Will skip contig < 3e7 (all but hs37d5, which I don’t process, so it will be eliminated). The contigs will be printed comma separated for bcftools view -r purposes.

How many regions do we have? Should have a bit more than 100.

regions <- data.frame()
for (i in contig_size %>% filter(length > 3e7, contig != 'hs37d5') %>% pull(contig)){
  regions <- rbind(regions,(n_printer(i)))
}
regions %>% nrow()
## [1] 115

output ’em for python input (Snakemake)

The second write command appends all of the chromosomes or contigs (in this case, just contigs) that are less than 3e7 in length to the output file. It comma separates them, which is how bcftools view -r takes in multiple chromosomes or contigs. The paste(., collapse=',') command at the end collapses the vector of contigs into a string with comma separation.

write(regions %>% mutate(f = paste(paste(chr, start, sep =':'), end, sep='-')) %>% pull(f), file='vcf_region_split_coords.txt')
write(contig_size %>% filter(length < 3e7, contig != 'hs37d5') %>% pull(contig) %>% paste(., collapse=','), file='vcf_region_split_coords.txt', append = T)

rscript

I’ve wrapped up the functions and handling as a Rscript that takes the header of a vcf as input and outputs and writes to a user-given file the regions. The script also allows you to select desired number of regions (you will almost always get a few more), the output file name, and the genome size (defaults to human genome). The script is here.

Using the script output

I’m using it in a Snakemake pipeline. bcftools can use it with -R (region) if you run the script like this (see source for comments): Rscript split_vcf_into_n_pieces.R yourVCF.header 200 vcf_region_split_200_coords.txt 3e9 bed

sessionInfo()

devtools::session_info()
## Session info -------------------------------------------------------------
##  setting  value                       
##  version  R version 3.5.0 (2018-04-23)
##  system   x86_64, darwin15.6.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  tz       Europe/London               
##  date     2018-09-17
## Packages -----------------------------------------------------------------
##  package    * version date       source        
##  assertthat   0.2.0   2017-04-11 CRAN (R 3.5.0)
##  backports    1.1.2   2017-12-13 CRAN (R 3.5.0)
##  base       * 3.5.0   2018-04-24 local         
##  bindr        0.1.1   2018-03-13 CRAN (R 3.5.0)
##  bindrcpp   * 0.2.2   2018-03-29 CRAN (R 3.5.0)
##  blogdown     0.8     2018-07-15 CRAN (R 3.5.0)
##  bookdown     0.7     2018-02-18 CRAN (R 3.5.0)
##  broom        0.4.4   2018-03-29 CRAN (R 3.5.0)
##  cellranger   1.1.0   2016-07-27 CRAN (R 3.5.0)
##  cli          1.0.0   2017-11-05 CRAN (R 3.5.0)
##  colorspace   1.3-2   2016-12-14 CRAN (R 3.5.0)
##  compiler     3.5.0   2018-04-24 local         
##  crayon       1.3.4   2017-09-16 CRAN (R 3.5.0)
##  datasets   * 3.5.0   2018-04-24 local         
##  devtools     1.13.6  2018-06-27 CRAN (R 3.5.0)
##  digest       0.6.15  2018-01-28 CRAN (R 3.5.0)
##  dplyr      * 0.7.6   2018-06-29 cran (@0.7.6) 
##  evaluate     0.10.1  2017-06-24 CRAN (R 3.5.0)
##  forcats    * 0.3.0   2018-02-19 CRAN (R 3.5.0)
##  foreign      0.8-70  2017-11-28 CRAN (R 3.5.0)
##  ggplot2    * 3.0.0   2018-07-03 cran (@3.0.0) 
##  glue         1.2.0   2017-10-29 CRAN (R 3.5.0)
##  graphics   * 3.5.0   2018-04-24 local         
##  grDevices  * 3.5.0   2018-04-24 local         
##  grid         3.5.0   2018-04-24 local         
##  gtable       0.2.0   2016-02-26 CRAN (R 3.5.0)
##  haven        1.1.1   2018-01-18 CRAN (R 3.5.0)
##  hms          0.4.2   2018-03-10 CRAN (R 3.5.0)
##  htmltools    0.3.6   2017-04-28 CRAN (R 3.5.0)
##  httr         1.3.1   2017-08-20 CRAN (R 3.5.0)
##  jsonlite     1.5     2017-06-01 CRAN (R 3.5.0)
##  knitr        1.20    2018-02-20 CRAN (R 3.5.0)
##  lattice      0.20-35 2017-03-25 CRAN (R 3.5.0)
##  lazyeval     0.2.1   2017-10-29 CRAN (R 3.5.0)
##  lubridate    1.7.4   2018-04-11 CRAN (R 3.5.0)
##  magrittr     1.5     2014-11-22 CRAN (R 3.5.0)
##  memoise      1.1.0   2017-04-21 CRAN (R 3.5.0)
##  methods    * 3.5.0   2018-04-24 local         
##  mnormt       1.5-5   2016-10-15 CRAN (R 3.5.0)
##  modelr       0.1.2   2018-05-11 CRAN (R 3.5.0)
##  munsell      0.4.3   2016-02-13 CRAN (R 3.5.0)
##  nlme         3.1-137 2018-04-07 CRAN (R 3.5.0)
##  parallel     3.5.0   2018-04-24 local         
##  pillar       1.2.3   2018-05-25 CRAN (R 3.5.0)
##  pkgconfig    2.0.1   2017-03-21 CRAN (R 3.5.0)
##  plyr         1.8.4   2016-06-08 CRAN (R 3.5.0)
##  psych        1.8.4   2018-05-06 CRAN (R 3.5.0)
##  purrr      * 0.2.5   2018-05-29 CRAN (R 3.5.0)
##  R6           2.2.2   2017-06-17 CRAN (R 3.5.0)
##  Rcpp         0.12.17 2018-05-18 CRAN (R 3.5.0)
##  readr      * 1.1.1   2017-05-16 CRAN (R 3.5.0)
##  readxl       1.1.0   2018-04-20 CRAN (R 3.5.0)
##  reshape2     1.4.3   2017-12-11 CRAN (R 3.5.0)
##  rlang        0.2.1   2018-05-30 CRAN (R 3.5.0)
##  rmarkdown    1.10    2018-06-11 cran (@1.10)  
##  rprojroot    1.3-2   2018-01-03 CRAN (R 3.5.0)
##  rstudioapi   0.7     2017-09-07 CRAN (R 3.5.0)
##  rvest        0.3.2   2016-06-17 CRAN (R 3.5.0)
##  scales       0.5.0   2017-08-24 CRAN (R 3.5.0)
##  stats      * 3.5.0   2018-04-24 local         
##  stringi      1.2.2   2018-05-02 CRAN (R 3.5.0)
##  stringr    * 1.3.1   2018-05-10 CRAN (R 3.5.0)
##  tibble     * 1.4.2   2018-01-22 CRAN (R 3.5.0)
##  tidyr      * 0.8.1   2018-05-18 CRAN (R 3.5.0)
##  tidyselect   0.2.4   2018-02-26 CRAN (R 3.5.0)
##  tidyverse  * 1.2.1   2017-11-14 CRAN (R 3.5.0)
##  tools        3.5.0   2018-04-24 local         
##  utf8         1.1.4   2018-05-24 CRAN (R 3.5.0)
##  utils      * 3.5.0   2018-04-24 local         
##  withr        2.1.2   2018-03-15 CRAN (R 3.5.0)
##  xfun         0.3     2018-07-06 CRAN (R 3.5.0)
##  xml2         1.2.0   2018-01-24 CRAN (R 3.5.0)
##  yaml         2.1.19  2018-05-01 CRAN (R 3.5.0)

```

Related

comments powered by Disqus