dplyr
to do itsetwd("~/Desktop/workshops/intro-r/")
d <- read.csv("data/Dataset_S1.txt")
colnames(d)[12] <- "percent.GC"
Let’s create a binned variable find the mean depth for three GC bins we created
d$GC.binned <- cut(d$percent.GC, 5)
d$GC.binned
> # [1] (51.6,68.5] (34.7,51.6] (34.7,51.6] (34.7,51.6] (34.7,51.6]
> # [6] (34.7,51.6] (34.7,51.6] (34.7,51.6] (34.7,51.6] (17.7,34.7]
> # [11] (17.7,34.7] (34.7,51.6] (34.7,51.6] (34.7,51.6] (34.7,51.6]
> # [16] (34.7,51.6] (34.7,51.6] (34.7,51.6] (34.7,51.6] (34.7,51.6]
> # [21] (34.7,51.6] (17.7,34.7] (34.7,51.6] (34.7,51.6] (51.6,68.5]
> # [26] (34.7,51.6] (34.7,51.6] (17.7,34.7] (34.7,51.6] (34.7,51.6]
> # [31] (34.7,51.6] (34.7,51.6] (34.7,51.6] (34.7,51.6] (17.7,34.7]
> # [36] (34.7,51.6] (51.6,68.5] (34.7,51.6] (34.7,51.6] (34.7,51.6]
> # [41] (17.7,34.7] (17.7,34.7] (34.7,51.6] (17.7,34.7] (17.7,34.7]
> # [46] (17.7,34.7] (34.7,51.6] (34.7,51.6] (34.7,51.6] (17.7,34.7]
> # [51] (34.7,51.6] (34.7,51.6] (34.7,51.6] (34.7,51.6] (34.7,51.6]
> # [56] (34.7,51.6] (34.7,51.6] (17.7,34.7] (34.7,51.6] (34.7,51.6]
> # [61] (17.7,34.7] (17.7,34.7] (17.7,34.7] (34.7,51.6] (17.7,34.7]
> # [66] (17.7,34.7] (34.7,51.6] (34.7,51.6] (34.7,51.6] (34.7,51.6]
> # [71] (34.7,51.6] (34.7,51.6] (34.7,51.6] (34.7,51.6] (34.7,51.6]
> # [ reached getOption("max.print") -- omitted 59065 entries ]
> # 5 Levels: (0.716,17.7] (17.7,34.7] (34.7,51.6] ... (68.5,85.6]
table(d$GC.binned)
> #
> # (0.716,17.7] (17.7,34.7] (34.7,51.6] (51.6,68.5] (68.5,85.6]
> # 6 4976 45784 8122 252
split(x, f)
where: x is a dataframe and f is a factord$depth
column into a list based on the factor d$GC.binned
d_split <- split(d$depth, d$GC.binned)
str(d_split)
> # List of 5
> # $ (0.716,17.7]: num [1:6] 4.57 1.12 6.95 2.66 3.69 3.87
> # $ (17.7,34.7] : num [1:4976] 8 8.38 9.02 10.31 12.09 ...
> # $ (34.7,51.6] : num [1:45784] 6.68 9.06 10.26 8.06 7.05 ...
> # $ (51.6,68.5] : num [1:8122] 3.41 7 6.63 7.15 6.97 4.77 5.18 6.6 6.62 5.05 ...
> # $ (68.5,85.6] : num [1:252] 8.04 1.96 3.71 1.97 4.82 4.22 3.76 3.84 1.85 1.05 ...
split()
returns a list with each element containing all observations for a particular level of the factor used in the groupingsplit()
contains elements corresponding to the levels of d$GC.binned
using names(d_split)
, levels(d$GC.binned)
, length(d_split)
, and nlevels(d$GC.binned)
.head(d$GC.binned)
> # [1] (51.6,68.5] (34.7,51.6] (34.7,51.6] (34.7,51.6] (34.7,51.6] (34.7,51.6]
> # 5 Levels: (0.716,17.7] (17.7,34.7] (34.7,51.6] ... (68.5,85.6]
names(d_split)
> # [1] "(0.716,17.7]" "(17.7,34.7]" "(34.7,51.6]" "(51.6,68.5]"
> # [5] "(68.5,85.6]"
levels(d$GC.binned)
> # [1] "(0.716,17.7]" "(17.7,34.7]" "(34.7,51.6]" "(51.6,68.5]"
> # [5] "(68.5,85.6]"
lapply(d_split, mean)
> # $`(0.716,17.7]`
> # [1] 3.81
> #
> # $`(17.7,34.7]`
> # [1] 8.788244
> #
> # $`(34.7,51.6]`
> # [1] 8.296699
> #
> # $`(51.6,68.5]`
> # [1] 7.309941
> #
> # $`(68.5,85.6]`
> # [1] 4.037698
# look at str
unlist(lapply(d_split, mean))
> # (0.716,17.7] (17.7,34.7] (34.7,51.6] (51.6,68.5] (68.5,85.6]
> # 3.810000 8.788244 8.296699 7.309941 4.037698
unlist
returns a vector of the highest type it cansapply(d_split, mean)
> # (0.716,17.7] (17.7,34.7] (34.7,51.6] (51.6,68.5] (68.5,85.6]
> # 3.810000 8.788244 8.296699 7.309941 4.037698
tapply(d$depth, d$GC.binned, mean)
> # (0.716,17.7] (17.7,34.7] (34.7,51.6] (51.6,68.5] (68.5,85.6]
> # 3.810000 8.788244 8.296699 7.309941 4.037698
aggregate(d$depth, list(gc = d$GC.binned), mean)
> # gc x
> # 1 (0.716,17.7] 3.810000
> # 2 (17.7,34.7] 8.788244
> # 3 (34.7,51.6] 8.296699
> # 4 (51.6,68.5] 7.309941
> # 5 (68.5,85.6] 4.037698
dplyr
is a package for making data manipulation easierstr()
& round()
are built indplyr
b/c it’s not built-indplyr
written in C++
so it’s very fast# install.packages('dplyr') # install dplyr if it's not already installed
library(dplyr)
d_df <- tbl_df(d)
d_df
> # Source: local data frame [59,140 x 17]
> #
> # start end total.SNPs total.Bases depth unique.SNPs dhSNPs
> # (int) (int) (int) (int) (dbl) (int) (int)
> # 1 55001 56000 0 1894 3.41 0 0
> # 2 56001 57000 5 6683 6.68 2 2
> # 3 57001 58000 1 9063 9.06 1 0
> # 4 58001 59000 7 10256 10.26 3 2
> # 5 59001 60000 4 8057 8.06 4 0
> # 6 60001 61000 6 7051 7.05 2 1
> # 7 61001 62000 7 6950 6.95 2 1
> # 8 62001 63000 1 8834 8.83 1 0
> # 9 63001 64000 1 9629 9.63 1 0
> # [ reached getOption("max.print") -- omitted 2 rows ]
> # Variables not shown: reference.Bases (int), Theta (dbl), Pi (dbl),
> # Heterozygosity (dbl), percent.GC (dbl), Recombination (dbl), Divergence
> # (dbl), Constraint (int), SNPs (int), GC.binned (fctr)
dplyr functions
select
, filter()
, mutate()
, group_by
, summarise()
select()
for selecting columnsselect(d_df, start, end, Pi, Recombination, depth)
> # Source: local data frame [59,140 x 5]
> #
> # start end Pi Recombination depth
> # (int) (int) (dbl) (dbl) (dbl)
> # 1 55001 56000 0.000 0.009601574 3.41
> # 2 56001 57000 10.354 0.009601574 6.68
> # 3 57001 58000 1.986 0.009601574 9.06
> # 4 58001 59000 9.556 0.009601574 10.26
> # 5 59001 60000 8.506 0.009601574 8.06
> # 6 60001 61000 9.121 0.009601574 7.05
> # 7 61001 62000 8.062 0.009601574 6.95
> # 8 62001 63000 2.065 0.009601574 8.83
> # 9 63001 64000 1.879 0.009601574 9.63
> # 10 64001 65000 5.408 0.009584180 8.00
> # .. ... ... ... ... ...
d[, c("start", "end", "Pi", "Recombination", "depth")]
> # start end Pi Recombination depth
> # 1 55001 56000 0.000 0.009601574 3.41
> # 2 56001 57000 10.354 0.009601574 6.68
> # 3 57001 58000 1.986 0.009601574 9.06
> # 4 58001 59000 9.556 0.009601574 10.26
> # 5 59001 60000 8.506 0.009601574 8.06
> # 6 60001 61000 9.121 0.009601574 7.05
> # 7 61001 62000 8.062 0.009601574 6.95
> # 8 62001 63000 2.065 0.009601574 8.83
> # 9 63001 64000 1.879 0.009601574 9.63
> # 10 64001 65000 5.408 0.009584180 8.00
> # 11 65001 66000 0.000 0.009582394 8.38
> # 12 66001 67000 4.178 0.009582394 8.43
> # 13 67001 68000 7.561 0.009582394 10.84
> # 14 68001 69000 3.252 0.009582394 5.38
> # 15 69001 70000 5.750 0.007277240 8.01
> # [ reached getOption("max.print") -- omitted 59125 rows ]
select(d_df, start:total.Bases)
> # Source: local data frame [59,140 x 4]
> #
> # start end total.SNPs total.Bases
> # (int) (int) (int) (int)
> # 1 55001 56000 0 1894
> # 2 56001 57000 5 6683
> # 3 57001 58000 1 9063
> # 4 58001 59000 7 10256
> # 5 59001 60000 4 8057
> # 6 60001 61000 6 7051
> # 7 61001 62000 7 6950
> # 8 62001 63000 1 8834
> # 9 63001 64000 1 9629
> # 10 64001 65000 3 7999
> # .. ... ... ... ...
select(d_df, -(start:percent.GC))
> # Source: local data frame [59,140 x 5]
> #
> # Recombination Divergence Constraint SNPs GC.binned
> # (dbl) (dbl) (int) (int) (fctr)
> # 1 0.009601574 0.003006012 0 0 (51.6,68.5]
> # 2 0.009601574 0.018018020 0 0 (34.7,51.6]
> # 3 0.009601574 0.007007007 0 0 (34.7,51.6]
> # 4 0.009601574 0.012012010 0 0 (34.7,51.6]
> # 5 0.009601574 0.024024020 0 0 (34.7,51.6]
> # 6 0.009601574 0.016016020 0 0 (34.7,51.6]
> # 7 0.009601574 0.012012010 0 0 (34.7,51.6]
> # 8 0.009601574 0.015015020 0 0 (34.7,51.6]
> # 9 0.009601574 0.009009009 58 1 (34.7,51.6]
> # 10 0.009584180 0.007007007 0 1 (17.7,34.7]
> # .. ... ... ... ... ...
Filter for rows:
Yesterday we filtered by row with:
d[d$Pi > 16 & d$percent.GC > 80, ]
> # start end total.SNPs total.Bases depth unique.SNPs dhSNPs
> # 58550 63097001 63098000 5 947 2.39 2 1
> # 58641 63188001 63189000 2 1623 3.21 2 0
> # 58642 63189001 63190000 5 1395 1.89 3 2
> # reference.Bases Theta Pi Heterozygosity percent.GC
> # 58550 397 37.544 41.172 52.784 82.0821
> # 58641 506 16.436 16.436 12.327 82.3824
> # 58642 738 35.052 41.099 35.842 80.5806
> # Recombination Divergence Constraint SNPs GC.binned
> # 58550 0.000781326 0.03826531 226 1 (68.5,85.6]
> # 58641 0.000347382 0.01678657 148 0 (68.5,85.6]
> # 58642 0.000347382 0.01793722 0 0 (68.5,85.6]
filter(d_df, Pi > 16, percent.GC > 80)
> # Source: local data frame [3 x 17]
> #
> # start end total.SNPs total.Bases depth unique.SNPs dhSNPs
> # (int) (int) (int) (int) (dbl) (int) (int)
> # 1 63097001 63098000 5 947 2.39 2 1
> # 2 63188001 63189000 2 1623 3.21 2 0
> # 3 63189001 63190000 5 1395 1.89 3 2
> # Variables not shown: reference.Bases (int), Theta (dbl), Pi (dbl),
> # Heterozygosity (dbl), percent.GC (dbl), Recombination (dbl), Divergence
> # (dbl), Constraint (int), SNPs (int), GC.binned (fctr)
arrange(d_df, depth)
> # Source: local data frame [59,140 x 17]
> #
> # start end total.SNPs total.Bases depth unique.SNPs dhSNPs
> # (int) (int) (int) (int) (dbl) (int) (int)
> # 1 1234001 1235000 0 444 1 0 0
> # 2 1584001 1585000 0 716 1 0 0
> # 3 2799001 2800000 0 277 1 0 0
> # 4 2800001 2801000 0 115 1 0 0
> # 5 7092001 7093000 0 519 1 0 0
> # 6 7093001 7094000 2 686 1 2 0
> # 7 11611001 11612000 1 429 1 1 0
> # 8 12781001 12782000 0 165 1 0 0
> # 9 25765001 25766000 0 221 1 0 0
> # [ reached getOption("max.print") -- omitted 2 rows ]
> # Variables not shown: reference.Bases (int), Theta (dbl), Pi (dbl),
> # Heterozygosity (dbl), percent.GC (dbl), Recombination (dbl), Divergence
> # (dbl), Constraint (int), SNPs (int), GC.binned (fctr)
arrange(d_df, desc(total.SNPs), desc(depth))
> # Source: local data frame [59,140 x 17]
> #
> # start end total.SNPs total.Bases depth unique.SNPs dhSNPs
> # (int) (int) (int) (int) (dbl) (int) (int)
> # 1 2621001 2622000 93 11337 11.34 13 10
> # 2 13023001 13024000 88 11784 11.78 11 1
> # 3 47356001 47357000 87 12505 12.50 9 7
> # 4 59992001 59993000 84 11553 11.55 15 6
> # 5 8769001 8770000 83 10253 10.25 12 7
> # 6 17840001 17841000 81 13347 13.35 11 8
> # 7 47355001 47356000 80 14218 14.22 8 7
> # 8 45888001 45889000 80 9820 9.82 11 10
> # 9 24897001 24898000 78 11206 11.21 16 14
> # [ reached getOption("max.print") -- omitted 2 rows ]
> # Variables not shown: reference.Bases (int), Theta (dbl), Pi (dbl),
> # Heterozygosity (dbl), percent.GC (dbl), Recombination (dbl), Divergence
> # (dbl), Constraint (int), SNPs (int), GC.binned (fctr)
mutate()
function, we can add new columns to our dataframe: For example, we added a rescaled version of the Pi column as ddiversity—let’sdropddiversity using select()
# d_df <- select(d_df, -diversity) # remove our earlier diversity column
d_df <- mutate(d_df, diversity = Pi/(10 * 1000))
d_df
> # Source: local data frame [59,140 x 18]
> #
> # start end total.SNPs total.Bases depth unique.SNPs dhSNPs
> # (int) (int) (int) (int) (dbl) (int) (int)
> # 1 55001 56000 0 1894 3.41 0 0
> # 2 56001 57000 5 6683 6.68 2 2
> # 3 57001 58000 1 9063 9.06 1 0
> # 4 58001 59000 7 10256 10.26 3 2
> # 5 59001 60000 4 8057 8.06 4 0
> # 6 60001 61000 6 7051 7.05 2 1
> # 7 61001 62000 7 6950 6.95 2 1
> # 8 62001 63000 1 8834 8.83 1 0
> # 9 63001 64000 1 9629 9.63 1 0
> # [ reached getOption("max.print") -- omitted 2 rows ]
> # Variables not shown: reference.Bases (int), Theta (dbl), Pi (dbl),
> # Heterozygosity (dbl), percent.GC (dbl), Recombination (dbl), Divergence
> # (dbl), Constraint (int), SNPs (int), GC.binned (fctr), diversity (dbl)
dplyr
- select then filter%>
is the pipe operator provided by the magrittr
package and utilized by dplyr
pipe example from d_df
d_df %>% mutate(GC.scaled = scale(percent.GC)) %>% filter(GC.scaled > 4, depth >
4) %>% select(start, end, depth, GC.scaled, percent.GC) %>% arrange(desc(depth))
> # Source: local data frame [18 x 5]
> #
> # start end depth GC.scaled percent.GC
> # (int) (int) (dbl) (dbl) (dbl)
> # 1 62535001 62536000 7.66 4.040263 73.9740
> # 2 63065001 63066000 6.20 4.229954 75.3754
> # 3 62492001 62493000 5.25 4.243503 75.4755
> # 4 40680001 40681000 5.19 4.555139 77.7778
> # 5 63396001 63397000 5.17 4.094460 74.3744
> # 6 63441001 63442000 5.15 4.121559 74.5746
> # 7 4662001 4663000 5.11 4.514490 77.4775
> # 8 47099001 47100000 4.89 4.622885 78.2783
> # 9 58151001 58152000 4.66 4.216404 75.2753
> # 10 25033001 25034000 4.55 4.026713 73.8739
> # 11 62630001 62631000 4.43 4.338349 76.1762
> # 12 61382001 61383000 4.39 4.053812 74.0741
> # 13 49492001 49493000 4.34 4.067361 74.1742
> # 14 61563001 61564000 4.34 4.297701 75.8759
> # [ reached getOption("max.print") -- omitted 4 rows ]
## Factors
str(d_df)
> # Classes 'tbl_df', 'tbl' and 'data.frame': 59140 obs. of 18 variables:
> # $ start : int 55001 56001 57001 58001 59001 60001 61001 62001 63001 64001 ...
> # $ end : int 56000 57000 58000 59000 60000 61000 62000 63000 64000 65000 ...
> # $ total.SNPs : int 0 5 1 7 4 6 7 1 1 3 ...
> # $ total.Bases : int 1894 6683 9063 10256 8057 7051 6950 8834 9629 7999 ...
> # $ depth : num 3.41 6.68 9.06 10.26 8.06 ...
> # $ unique.SNPs : int 0 2 1 3 4 2 2 1 1 1 ...
> # $ dhSNPs : int 0 2 0 2 0 1 1 0 0 1 ...
> # $ reference.Bases: int 556 1000 1000 1000 1000 1000 1000 1000 1000 1000 ...
> # $ Theta : num 0 8.01 3.51 9.93 12.91 ...
> # $ Pi : num 0 10.35 1.99 9.56 8.51 ...
> # $ Heterozygosity : num 0 7.48 1.1 6.58 4.96 ...
> # $ percent.GC : num 54.8 42.4 37.2 38 41.3 ...
> # $ Recombination : num 0.0096 0.0096 0.0096 0.0096 0.0096 ...
> # $ Divergence : num 0.00301 0.01802 0.00701 0.01201 0.02402 ...
> # $ Constraint : int 0 0 0 0 0 0 0 0 58 0 ...
> # $ SNPs : int 0 0 0 0 0 0 0 0 1 1 ...
> # $ GC.binned : Factor w/ 5 levels "(0.716,17.7]",..: 4 3 3 3 3 3 3 3 3 2 ...
> # $ diversity : num 0 0.001035 0.000199 0.000956 0.000851 ...
read.csv()
will read any text vector as a factorGC.binned
is a factor w/5 levels: “(0.716,17.7]” “(17.7,34.7]” “(34.7,51.6]” “(51.6,68.5]” “(68.5,85.6]”First let’s switch to a dataset with more factors. mtfs
data has recombination rates around a degenerate sequence motif that occurs in repeats and contains estimates of the recombination rate for all windows within 40kb of each motif (for two motif variants). We’ll download from the internet.
NOTE: mtfs
dataset was created using the Genomic Ranges tools from tracks downloaded directly from the UCSC Genome Browser. With the appropriate tools and bioinformatics data skills, it takes surprisingly few steps to replicate part of this important scientific finding (though the original paper did much more than this—see Myers et al., 2008). For the code to reproduce the data used in this example, see the motif-example/ directory in this chapter’s directory on GitHub.
Let’s read in the mtfs data from github and convert to a dplyr table.
mtfs <- read.delim("https://raw.githubusercontent.com/vsbuffalo/bds-files/master/chapter-08-r/motif_recombrates.txt")
tbl_df
is a data frame tbl that wraps a local data frame – it comes from dplyr and makes printing tables prettier.mtfs_df <- tbl_df(mtfs)
head(mtfs_df, 3)
> # Source: local data frame [3 x 9]
> #
> # chr motif_start motif_end dist recomb_start recomb_end recom
> # (fctr) (int) (int) (dbl) (int) (int) (dbl)
> # 1 chrX 35471312 35471325 39323.0 35430651 35433340 0.0015
> # 2 chrX 35471312 35471325 36977.0 35433339 35435344 0.0015
> # 3 chrX 35471312 35471325 34797.5 35435343 35437699 0.0015
> # Variables not shown: motif (fctr), pos (fctr)
Let’s look at the structure of the data frame.
str(mtfs_df)
> # Classes 'tbl_df', 'tbl' and 'data.frame': 20050 obs. of 9 variables:
> # $ chr : Factor w/ 23 levels "chr1","chr10",..: 23 23 23 23 23 23 23 23 23 23 ...
> # $ motif_start : int 35471312 35471312 35471312 35471312 35471312 35471312 35471312 35471312 35471312 35471312 ...
> # $ motif_end : int 35471325 35471325 35471325 35471325 35471325 35471325 35471325 35471325 35471325 35471325 ...
> # $ dist : num 39323 36977 34798 31850 27463 ...
> # $ recomb_start: int 35430651 35433339 35435343 35437698 35441239 35446471 35446497 35452573 35453030 35454726 ...
> # $ recomb_end : int 35433340 35435344 35437699 35441240 35446472 35446498 35452574 35453031 35454727 35456283 ...
> # $ recom : num 0.0015 0.0015 0.0015 0.0015 0.0015 ...
> # $ motif : Factor w/ 2 levels "CCTCCCTAGCCAC",..: 2 2 2 2 2 2 2 2 2 2 ...
> # $ pos : Factor w/ 695 levels "chr1-101890123",..: 684 684 684 684 684 684 684 684 684 684 ...
Now we can use group_by
function to group the data (this is our split stage).
mtfs_df %>% group_by(chr)
> # Source: local data frame [20,050 x 9]
> # Groups: chr [23]
> #
> # chr motif_start motif_end dist recomb_start recomb_end recom
> # (fctr) (int) (int) (dbl) (int) (int) (dbl)
> # 1 chrX 35471312 35471325 39323.0 35430651 35433340 0.0015
> # 2 chrX 35471312 35471325 36977.0 35433339 35435344 0.0015
> # 3 chrX 35471312 35471325 34797.5 35435343 35437699 0.0015
> # 4 chrX 35471312 35471325 31849.5 35437698 35441240 0.0015
> # 5 chrX 35471312 35471325 27463.0 35441239 35446472 0.0015
> # 6 chrX 35471312 35471325 24834.0 35446471 35446498 0.0016
> # 7 chrX 35471312 35471325 21783.0 35446497 35452574 0.0017
> # 8 chrX 35471312 35471325 18516.5 35452573 35453031 1.5073
> # 9 chrX 35471312 35471325 17440.0 35453030 35454727 0.0076
> # [ reached getOption("max.print") -- omitted 2 rows ]
> # Variables not shown: motif (fctr), pos (fctr)
mtfs_df %>% group_by(chr) %>% summarize(max_recom = max(recom), mean_recom = mean(recom),
num = n())
> # Source: local data frame [23 x 4]
> #
> # chr max_recom mean_recom num
> # (fctr) (dbl) (dbl) (int)
> # 1 chr1 41.5648 2.217759 2095
> # 2 chr10 42.4129 2.162635 1029
> # 3 chr11 36.1703 2.774918 560
> # 4 chr12 31.6890 2.214382 1014
> # 5 chr13 36.2022 1.751010 773
> # 6 chr14 26.9460 1.963760 542
> # 7 chr15 28.8763 1.961306 508
> # 8 chr16 54.9638 2.436250 535
> # 9 chr17 23.4195 2.513056 574
> # 10 chr18 51.5166 2.537674 576
> # .. ... ... ... ...
Because we’ve grouped this data by chromosome, summarize() computes per-group summaries
n_distinct()
returns the unique number of observations in each groupfirst()
, last()
and nth()
return the first, last, and nth observations,arrange()
(because specific rows are arbitrary in unsorted data).mtfs_df %>% group_by(chr) %>% summarize(max_recom = max(recom), mean_recom = mean(recom),
num = n()) %>% arrange(desc(max_recom))
> # Source: local data frame [23 x 4]
> #
> # chr max_recom mean_recom num
> # (fctr) (dbl) (dbl) (int)
> # 1 chrX 74.0966 2.686840 693
> # 2 chr8 62.6081 1.913325 1727
> # 3 chr3 56.2775 1.889585 1409
> # 4 chr16 54.9638 2.436250 535
> # 5 chr22 54.4492 2.567579 894
> # 6 chr18 51.5166 2.537674 576
> # 7 chr4 49.9566 1.838564 759
> # 8 chr21 48.0816 1.709787 281
> # 9 chr10 42.4129 2.162635 1029
> # 10 chr1 41.5648 2.217759 2095
> # .. ... ... ... ...
Challenge
Looking at mtfs_df use dplyr to group by char and create summary statistics > for variance (var), standard deviation (sd), and mean on the recom column. > Arrange variance data by descending order.
mtfs_df %>% group_by(chr) %>% summarize(variance = var(recom), std = sd(recom),
mean_recom = mean(recom)) %>% arrange(desc(variance))
> # Source: local data frame [23 x 4]
> #
> # chr variance std mean_recom
> # (fctr) (dbl) (dbl) (dbl)
> # 1 chrX 42.44125 6.514695 2.686840
> # 2 chr18 38.76817 6.226409 2.537674
> # 3 chr22 37.66113 6.136866 2.567579
> # 4 chr8 26.45906 5.143837 1.913325
> # 5 chr11 24.64802 4.964677 2.774918
> # 6 chr6 23.32969 4.830082 2.402265
> # 7 chr21 21.06245 4.589385 1.709787
> # 8 chr10 20.10548 4.483913 2.162635
> # 9 chr3 19.69511 4.437918 1.889585
> # 10 chr16 18.99976 4.358871 2.436250
> # .. ... ... ... ...
Challenge
Go back to our d_df dataframe where we binned the data as GC.binned. Write > a dplyr stanza and group by that variable and summarize by another variable > of your choice, e.g. mean(Pi), st(Pi), etc.