Introduction
bcftools view -r 1:40000-50000 vcf.gz
will output (to stdout) a vcf containing the header and variants on chromosome 1 between coordinates 40,000 and 50,000 base pairs.
I need to break down a large vcf into smaller pieces to dramatically speed up annotation. Let’s try 100 pieces.
The human genome is approximately 3 gigabases or 3e9 base pairs.
\[ \frac{3 * 10^9\ base\ pairs}{100\ pieces} = 3*10^7\ base\ pairs\ per\ piece \]
That’s our target size.
This is made a bit tricky since the genome is laid by chromosome. So we have to break into 3e7 pieces, accounting for chromosomes. There are also many contigs, most of which are well under 3e7 in size. Those can be processed as a group with bcftools
by splitting each contig by a ,
.
Let’s read in the header. It contains chromosome (and contig) sizes, which I’ve extracted from the vcf with zcat EGAD00001002656.GATK.vcf.gz | head -n 1000 | grep ^## > /home/mcgaugheyd/git/OGVFB_one_offs/mcgaughey/split_VCFs_into_n_pieces/EGAD00001002656.header
Read in vcf header
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.6
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(stringr)
vcf_header = scan('~/git/OGVFB_one_offs/mcgaughey/split_VCFs_into_n_pieces/EGAD00001002656.header', what='character')
vcf_header[grepl('contig',vcf_header)]
## [1] "##contig=<ID=1,length=249250621,assembly=b37>"
## [2] "##contig=<ID=2,length=243199373,assembly=b37>"
## [3] "##contig=<ID=3,length=198022430,assembly=b37>"
## [4] "##contig=<ID=4,length=191154276,assembly=b37>"
## [5] "##contig=<ID=5,length=180915260,assembly=b37>"
## [6] "##contig=<ID=6,length=171115067,assembly=b37>"
## [7] "##contig=<ID=7,length=159138663,assembly=b37>"
## [8] "##contig=<ID=8,length=146364022,assembly=b37>"
## [9] "##contig=<ID=9,length=141213431,assembly=b37>"
## [10] "##contig=<ID=10,length=135534747,assembly=b37>"
## [11] "##contig=<ID=11,length=135006516,assembly=b37>"
## [12] "##contig=<ID=12,length=133851895,assembly=b37>"
## [13] "##contig=<ID=13,length=115169878,assembly=b37>"
## [14] "##contig=<ID=14,length=107349540,assembly=b37>"
## [15] "##contig=<ID=15,length=102531392,assembly=b37>"
## [16] "##contig=<ID=16,length=90354753,assembly=b37>"
## [17] "##contig=<ID=17,length=81195210,assembly=b37>"
## [18] "##contig=<ID=18,length=78077248,assembly=b37>"
## [19] "##contig=<ID=19,length=59128983,assembly=b37>"
## [20] "##contig=<ID=20,length=63025520,assembly=b37>"
## [21] "##contig=<ID=21,length=48129895,assembly=b37>"
## [22] "##contig=<ID=22,length=51304566,assembly=b37>"
## [23] "##contig=<ID=X,length=155270560,assembly=b37>"
## [24] "##contig=<ID=Y,length=59373566,assembly=b37>"
## [25] "##contig=<ID=MT,length=16569,assembly=b37>"
## [26] "##contig=<ID=GL000207.1,length=4262,assembly=b37>"
## [27] "##contig=<ID=GL000226.1,length=15008,assembly=b37>"
## [28] "##contig=<ID=GL000229.1,length=19913,assembly=b37>"
## [29] "##contig=<ID=GL000231.1,length=27386,assembly=b37>"
## [30] "##contig=<ID=GL000210.1,length=27682,assembly=b37>"
## [31] "##contig=<ID=GL000239.1,length=33824,assembly=b37>"
## [32] "##contig=<ID=GL000235.1,length=34474,assembly=b37>"
## [33] "##contig=<ID=GL000201.1,length=36148,assembly=b37>"
## [34] "##contig=<ID=GL000247.1,length=36422,assembly=b37>"
## [35] "##contig=<ID=GL000245.1,length=36651,assembly=b37>"
## [36] "##contig=<ID=GL000197.1,length=37175,assembly=b37>"
## [37] "##contig=<ID=GL000203.1,length=37498,assembly=b37>"
## [38] "##contig=<ID=GL000246.1,length=38154,assembly=b37>"
## [39] "##contig=<ID=GL000249.1,length=38502,assembly=b37>"
## [40] "##contig=<ID=GL000196.1,length=38914,assembly=b37>"
## [41] "##contig=<ID=GL000248.1,length=39786,assembly=b37>"
## [42] "##contig=<ID=GL000244.1,length=39929,assembly=b37>"
## [43] "##contig=<ID=GL000238.1,length=39939,assembly=b37>"
## [44] "##contig=<ID=GL000202.1,length=40103,assembly=b37>"
## [45] "##contig=<ID=GL000234.1,length=40531,assembly=b37>"
## [46] "##contig=<ID=GL000232.1,length=40652,assembly=b37>"
## [47] "##contig=<ID=GL000206.1,length=41001,assembly=b37>"
## [48] "##contig=<ID=GL000240.1,length=41933,assembly=b37>"
## [49] "##contig=<ID=GL000236.1,length=41934,assembly=b37>"
## [50] "##contig=<ID=GL000241.1,length=42152,assembly=b37>"
## [51] "##contig=<ID=GL000243.1,length=43341,assembly=b37>"
## [52] "##contig=<ID=GL000242.1,length=43523,assembly=b37>"
## [53] "##contig=<ID=GL000230.1,length=43691,assembly=b37>"
## [54] "##contig=<ID=GL000237.1,length=45867,assembly=b37>"
## [55] "##contig=<ID=GL000233.1,length=45941,assembly=b37>"
## [56] "##contig=<ID=GL000204.1,length=81310,assembly=b37>"
## [57] "##contig=<ID=GL000198.1,length=90085,assembly=b37>"
## [58] "##contig=<ID=GL000208.1,length=92689,assembly=b37>"
## [59] "##contig=<ID=GL000191.1,length=106433,assembly=b37>"
## [60] "##contig=<ID=GL000227.1,length=128374,assembly=b37>"
## [61] "##contig=<ID=GL000228.1,length=129120,assembly=b37>"
## [62] "##contig=<ID=GL000214.1,length=137718,assembly=b37>"
## [63] "##contig=<ID=GL000221.1,length=155397,assembly=b37>"
## [64] "##contig=<ID=GL000209.1,length=159169,assembly=b37>"
## [65] "##contig=<ID=GL000218.1,length=161147,assembly=b37>"
## [66] "##contig=<ID=GL000220.1,length=161802,assembly=b37>"
## [67] "##contig=<ID=GL000213.1,length=164239,assembly=b37>"
## [68] "##contig=<ID=GL000211.1,length=166566,assembly=b37>"
## [69] "##contig=<ID=GL000199.1,length=169874,assembly=b37>"
## [70] "##contig=<ID=GL000217.1,length=172149,assembly=b37>"
## [71] "##contig=<ID=GL000216.1,length=172294,assembly=b37>"
## [72] "##contig=<ID=GL000215.1,length=172545,assembly=b37>"
## [73] "##contig=<ID=GL000205.1,length=174588,assembly=b37>"
## [74] "##contig=<ID=GL000219.1,length=179198,assembly=b37>"
## [75] "##contig=<ID=GL000224.1,length=179693,assembly=b37>"
## [76] "##contig=<ID=GL000223.1,length=180455,assembly=b37>"
## [77] "##contig=<ID=GL000195.1,length=182896,assembly=b37>"
## [78] "##contig=<ID=GL000212.1,length=186858,assembly=b37>"
## [79] "##contig=<ID=GL000222.1,length=186861,assembly=b37>"
## [80] "##contig=<ID=GL000200.1,length=187035,assembly=b37>"
## [81] "##contig=<ID=GL000193.1,length=189789,assembly=b37>"
## [82] "##contig=<ID=GL000194.1,length=191469,assembly=b37>"
## [83] "##contig=<ID=GL000225.1,length=211173,assembly=b37>"
## [84] "##contig=<ID=GL000192.1,length=547496,assembly=b37>"
## [85] "##contig=<ID=NC_007605,length=171823,assembly=b37>"
## [86] "##contig=<ID=hs37d5,length=35477943,assembly=b37>"
Parse out chr / contig sizes
# turn into data frame (well, a tibble)
contig_size <- vcf_header[grepl('contig', vcf_header)] %>%
data.frame() %>%
select(1, 'header' = 1) %>%
# separate by ,
separate(header, c('contig','length','assembly'),',') %>%
# extract values by splitting against = and taking the last element (first after reversing)
rowwise() %>%
mutate(contig = str_split(contig,'=')[[1]] %>% gsub('>','',.) %>% rev() %>% .[[1]],
length = str_split(length,'=')[[1]] %>% gsub('>','',.) %>% rev() %>% .[[1]] %>% as.numeric(),
assembly = str_split(assembly,'=')[[1]] %>% gsub('>','',.) %>% rev() %>% .[[1]])
contig_size
## Source: local data frame [86 x 3]
## Groups: <by row>
##
## # A tibble: 86 x 3
## contig length assembly
## <chr> <dbl> <chr>
## 1 1 249250621 b37
## 2 2 243199373 b37
## 3 3 198022430 b37
## 4 4 191154276 b37
## 5 5 180915260 b37
## 6 6 171115067 b37
## 7 7 159138663 b37
## 8 8 146364022 b37
## 9 9 141213431 b37
## 10 10 135534747 b37
## # ... with 76 more rows
Split chr above 3e7 base pairs into equal(ish) size pieces
ceiling
will allow intervals a bit less than 3e7 by rounding up the number of pieces per chromsome. Would rather have more splits with less than the target size.
n_split <- function(size){
pieces <- ceiling(size / 3e7)
seq(1, size, size/pieces)
}
print coordinates given a chromosome / contig
n_printer <- function(chr) {
# grab the legnth of chr or contig
size <- contig_size %>% filter(contig == chr) %>% pull(length)
# split into ~30e7 sized pieces
sequence <- n_split(size)
# add the max size to end (plus another base pair since the loop below reduces size by 1 to eliminate overlaps)
sequence <- c(sequence, size+1)
df <- data.frame()
for(i in 1:length(sequence)){
row <- cbind(chr, as.integer(sequence[max(i-1,1)]), # for first row, makes sure you don't pick the 0 position, which doesn't exit
as.integer(sequence[i]-1)) # decrements by one so you don't overlap
df <- rbind(df, row)
}
colnames(df) <- c('chr','start','end')
# skip first row which has dummy values
df[-1,]
}
calculate coordinates
Will skip contig < 3e7 (all but hs37d5, which I don’t process, so it will be eliminated). The contigs will be printed comma separated for bcftools view -r
purposes.
How many regions do we have? Should have a bit more than 100.
regions <- data.frame()
for (i in contig_size %>% filter(length > 3e7, contig != 'hs37d5') %>% pull(contig)){
regions <- rbind(regions,(n_printer(i)))
}
regions %>% nrow()
## [1] 115
print ’em
regions %>% mutate(f = paste(paste(chr, start, sep =':'), end, sep='-')) %>% select(f)
## f
## 1 1:1-27694513
## 2 1:27694514-55389026
## 3 1:55389027-83083540
## 4 1:83083541-110778053
## 5 1:110778054-138472567
## 6 1:138472568-166167080
## 7 1:166167081-193861594
## 8 1:193861595-221556107
## 9 1:221556108-249250621
## 10 2:1-27022152
## 11 2:27022153-54044305
## 12 2:54044306-81066457
## 13 2:81066458-108088610
## 14 2:108088611-135110762
## 15 2:135110763-162132915
## 16 2:162132916-189155067
## 17 2:189155068-216177220
## 18 2:216177221-243199373
## 19 3:1-28288918
## 20 3:28288919-56577837
## 21 3:56577838-84866755
## 22 3:84866756-113155674
## 23 3:113155675-141444592
## 24 3:141444593-169733511
## 25 3:169733512-198022430
## 26 4:1-27307753
## 27 4:27307754-54615507
## 28 4:54615508-81923261
## 29 4:81923262-109231014
## 30 4:109231015-136538768
## 31 4:136538769-163846522
## 32 4:163846523-191154276
## 33 5:1-25845037
## 34 5:25845038-51690074
## 35 5:51690075-77535111
## 36 5:77535112-103380148
## 37 5:103380149-129225185
## 38 5:129225186-155070222
## 39 5:155070223-180915260
## 40 6:1-28519177
## 41 6:28519178-57038355
## 42 6:57038356-85557533
## 43 6:85557534-114076711
## 44 6:114076712-142595889
## 45 6:142595890-171115067
## 46 7:1-26523110
## 47 7:26523111-53046221
## 48 7:53046222-79569331
## 49 7:79569332-106092442
## 50 7:106092443-132615552
## 51 7:132615553-159138663
## 52 8:1-29272804
## 53 8:29272805-58545608
## 54 8:58545609-87818413
## 55 8:87818414-117091217
## 56 8:117091218-146364022
## 57 9:1-28242686
## 58 9:28242687-56485372
## 59 9:56485373-84728058
## 60 9:84728059-112970744
## 61 9:112970745-141213431
## 62 10:1-27106949
## 63 10:27106950-54213898
## 64 10:54213899-81320848
## 65 10:81320849-108427797
## 66 10:108427798-135534747
## 67 11:1-27001303
## 68 11:27001304-54002606
## 69 11:54002607-81003909
## 70 11:81003910-108005212
## 71 11:108005213-135006516
## 72 12:1-26770379
## 73 12:26770380-53540758
## 74 12:53540759-80311137
## 75 12:80311138-107081516
## 76 12:107081517-133851895
## 77 13:1-28792469
## 78 13:28792470-57584939
## 79 13:57584940-86377408
## 80 13:86377409-115169878
## 81 14:1-26837385
## 82 14:26837386-53674770
## 83 14:53674771-80512155
## 84 14:80512156-107349540
## 85 15:1-25632848
## 86 15:25632849-51265696
## 87 15:51265697-76898544
## 88 15:76898545-102531392
## 89 16:1-22588688
## 90 16:22588689-45177376
## 91 16:45177377-67766064
## 92 16:67766065-90354753
## 93 17:1-27065070
## 94 17:27065071-54130140
## 95 17:54130141-81195210
## 96 18:1-26025749
## 97 18:26025750-52051498
## 98 18:52051499-78077248
## 99 19:1-29564491
## 100 19:29564492-59128983
## 101 20:1-21008506
## 102 20:21008507-42017013
## 103 20:42017014-63025520
## 104 21:1-24064947
## 105 21:24064948-48129895
## 106 22:1-25652283
## 107 22:25652284-51304566
## 108 X:1-25878426
## 109 X:25878427-51756853
## 110 X:51756854-77635280
## 111 X:77635281-103513706
## 112 X:103513707-129392133
## 113 X:129392134-155270560
## 114 Y:1-29686783
## 115 Y:29686784-59373566
output ’em for python input (Snakemake)
The second write command appends all of the chromosomes or contigs (in this case, just contigs) that are less than 3e7 in length to the output file. It comma separates them, which is how bcftools view -r
takes in multiple chromosomes or contigs. The paste(., collapse=',')
command at the end collapses the vector of contigs into a string with comma separation.
write(regions %>% mutate(f = paste(paste(chr, start, sep =':'), end, sep='-')) %>% pull(f), file='vcf_region_split_coords.txt')
write(contig_size %>% filter(length < 3e7, contig != 'hs37d5') %>% pull(contig) %>% paste(., collapse=','), file='vcf_region_split_coords.txt', append = T)
rscript
I’ve wrapped up the functions and handling as a Rscript that takes the header of a vcf as input and outputs and writes to a user-given file the regions. The script also allows you to select desired number of regions (you will almost always get a few more), the output file name, and the genome size (defaults to human genome). The script is here.
Using the script output
I’m using it in a Snakemake pipeline. bcftools
can use it with -R
(region) if you run the script like this (see source for comments): Rscript split_vcf_into_n_pieces.R yourVCF.header 200 vcf_region_split_200_coords.txt 3e9 bed
sessionInfo()
devtools::session_info()
## Session info -------------------------------------------------------------
## setting value
## version R version 3.5.0 (2018-04-23)
## system x86_64, darwin15.6.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## tz Europe/London
## date 2018-09-17
## Packages -----------------------------------------------------------------
## package * version date source
## assertthat 0.2.0 2017-04-11 CRAN (R 3.5.0)
## backports 1.1.2 2017-12-13 CRAN (R 3.5.0)
## base * 3.5.0 2018-04-24 local
## bindr 0.1.1 2018-03-13 CRAN (R 3.5.0)
## bindrcpp * 0.2.2 2018-03-29 CRAN (R 3.5.0)
## blogdown 0.8 2018-07-15 CRAN (R 3.5.0)
## bookdown 0.7 2018-02-18 CRAN (R 3.5.0)
## broom 0.4.4 2018-03-29 CRAN (R 3.5.0)
## cellranger 1.1.0 2016-07-27 CRAN (R 3.5.0)
## cli 1.0.0 2017-11-05 CRAN (R 3.5.0)
## colorspace 1.3-2 2016-12-14 CRAN (R 3.5.0)
## compiler 3.5.0 2018-04-24 local
## crayon 1.3.4 2017-09-16 CRAN (R 3.5.0)
## datasets * 3.5.0 2018-04-24 local
## devtools 1.13.6 2018-06-27 CRAN (R 3.5.0)
## digest 0.6.15 2018-01-28 CRAN (R 3.5.0)
## dplyr * 0.7.6 2018-06-29 cran (@0.7.6)
## evaluate 0.10.1 2017-06-24 CRAN (R 3.5.0)
## forcats * 0.3.0 2018-02-19 CRAN (R 3.5.0)
## foreign 0.8-70 2017-11-28 CRAN (R 3.5.0)
## ggplot2 * 3.0.0 2018-07-03 cran (@3.0.0)
## glue 1.2.0 2017-10-29 CRAN (R 3.5.0)
## graphics * 3.5.0 2018-04-24 local
## grDevices * 3.5.0 2018-04-24 local
## grid 3.5.0 2018-04-24 local
## gtable 0.2.0 2016-02-26 CRAN (R 3.5.0)
## haven 1.1.1 2018-01-18 CRAN (R 3.5.0)
## hms 0.4.2 2018-03-10 CRAN (R 3.5.0)
## htmltools 0.3.6 2017-04-28 CRAN (R 3.5.0)
## httr 1.3.1 2017-08-20 CRAN (R 3.5.0)
## jsonlite 1.5 2017-06-01 CRAN (R 3.5.0)
## knitr 1.20 2018-02-20 CRAN (R 3.5.0)
## lattice 0.20-35 2017-03-25 CRAN (R 3.5.0)
## lazyeval 0.2.1 2017-10-29 CRAN (R 3.5.0)
## lubridate 1.7.4 2018-04-11 CRAN (R 3.5.0)
## magrittr 1.5 2014-11-22 CRAN (R 3.5.0)
## memoise 1.1.0 2017-04-21 CRAN (R 3.5.0)
## methods * 3.5.0 2018-04-24 local
## mnormt 1.5-5 2016-10-15 CRAN (R 3.5.0)
## modelr 0.1.2 2018-05-11 CRAN (R 3.5.0)
## munsell 0.4.3 2016-02-13 CRAN (R 3.5.0)
## nlme 3.1-137 2018-04-07 CRAN (R 3.5.0)
## parallel 3.5.0 2018-04-24 local
## pillar 1.2.3 2018-05-25 CRAN (R 3.5.0)
## pkgconfig 2.0.1 2017-03-21 CRAN (R 3.5.0)
## plyr 1.8.4 2016-06-08 CRAN (R 3.5.0)
## psych 1.8.4 2018-05-06 CRAN (R 3.5.0)
## purrr * 0.2.5 2018-05-29 CRAN (R 3.5.0)
## R6 2.2.2 2017-06-17 CRAN (R 3.5.0)
## Rcpp 0.12.17 2018-05-18 CRAN (R 3.5.0)
## readr * 1.1.1 2017-05-16 CRAN (R 3.5.0)
## readxl 1.1.0 2018-04-20 CRAN (R 3.5.0)
## reshape2 1.4.3 2017-12-11 CRAN (R 3.5.0)
## rlang 0.2.1 2018-05-30 CRAN (R 3.5.0)
## rmarkdown 1.10 2018-06-11 cran (@1.10)
## rprojroot 1.3-2 2018-01-03 CRAN (R 3.5.0)
## rstudioapi 0.7 2017-09-07 CRAN (R 3.5.0)
## rvest 0.3.2 2016-06-17 CRAN (R 3.5.0)
## scales 0.5.0 2017-08-24 CRAN (R 3.5.0)
## stats * 3.5.0 2018-04-24 local
## stringi 1.2.2 2018-05-02 CRAN (R 3.5.0)
## stringr * 1.3.1 2018-05-10 CRAN (R 3.5.0)
## tibble * 1.4.2 2018-01-22 CRAN (R 3.5.0)
## tidyr * 0.8.1 2018-05-18 CRAN (R 3.5.0)
## tidyselect 0.2.4 2018-02-26 CRAN (R 3.5.0)
## tidyverse * 1.2.1 2017-11-14 CRAN (R 3.5.0)
## tools 3.5.0 2018-04-24 local
## utf8 1.1.4 2018-05-24 CRAN (R 3.5.0)
## utils * 3.5.0 2018-04-24 local
## withr 2.1.2 2018-03-15 CRAN (R 3.5.0)
## xfun 0.3 2018-07-06 CRAN (R 3.5.0)
## xml2 1.2.0 2018-01-24 CRAN (R 3.5.0)
## yaml 2.1.19 2018-05-01 CRAN (R 3.5.0)
```