install.packages("readxl")
# package that will read and write xls files
Download: http://www.datacarpentry.org/R-genomics/data/Ecoli_metadata.csv
metadata <- read.csv("http://www.datacarpentry.org/R-genomics/data/Ecoli_metadata.csv")
The above doesn’t produce any output b/c assignment doesn’t display anything. We can look at metadata
by wrapping the above in ()
.
(metadata <- read.csv("http://www.datacarpentry.org/R-genomics/data/Ecoli_metadata.csv"))
> # sample generation clade strain cit run genome_size
> # 1 REL606 0 <NA> REL606 unknown 4.62
> # 2 REL1166A 2000 unknown REL606 unknown SRR098028 4.63
> # 3 ZDB409 5000 unknown REL606 unknown SRR098281 4.60
> # 4 ZDB429 10000 UC REL606 unknown SRR098282 4.59
> # 5 ZDB446 15000 UC REL606 unknown SRR098283 4.66
> # 6 ZDB458 20000 (C1,C2) REL606 unknown SRR098284 4.63
> # 7 ZDB464* 20000 (C1,C2) REL606 unknown SRR098285 4.62
> # 8 ZDB467 20000 (C1,C2) REL606 unknown SRR098286 4.61
> # 9 ZDB477 25000 C1 REL606 unknown SRR098287 4.65
> # 10 ZDB483 25000 C3 REL606 unknown SRR098288 4.59
> # [ reached getOption("max.print") -- omitted 20 rows ]
head(metadata)
> # sample generation clade strain cit run genome_size
> # 1 REL606 0 <NA> REL606 unknown 4.62
> # 2 REL1166A 2000 unknown REL606 unknown SRR098028 4.63
> # 3 ZDB409 5000 unknown REL606 unknown SRR098281 4.60
> # 4 ZDB429 10000 UC REL606 unknown SRR098282 4.59
> # 5 ZDB446 15000 UC REL606 unknown SRR098283 4.66
> # 6 ZDB458 20000 (C1,C2) REL606 unknown SRR098284 4.63
str(metadata)
> # 'data.frame': 30 obs. of 7 variables:
> # $ sample : Factor w/ 30 levels "CZB152","CZB154",..: 7 6 18 19 20 21 22 23 24 25 ...
> # $ generation : int 0 2000 5000 10000 15000 20000 20000 20000 25000 25000 ...
> # $ clade : Factor w/ 7 levels "(C1,C2)","C1",..: NA 7 7 6 6 1 1 1 2 4 ...
> # $ strain : Factor w/ 1 level "REL606": 1 1 1 1 1 1 1 1 1 1 ...
> # $ cit : Factor w/ 3 levels "minus","plus",..: 3 3 3 3 3 3 3 3 3 3 ...
> # $ run : Factor w/ 30 levels "","SRR097977",..: 1 5 22 23 24 25 26 27 28 29 ...
> # $ genome_size: num 4.62 4.63 4.6 4.59 4.66 4.63 4.62 4.61 4.65 4.59 ...
read.csv()
will read any text vector as a factorcit
is a factor w/3 levels: minus
, plus
, and unknown
dplyr
is a package for making data manipulation easierstr()
& round()
are built indplyr
b/c it’s not built-in# install.packages('dplyr')
library("dplyr")
dplyr
written in C++
so it’s very fastdplyr functions
select
, filter()
, mutate()
, group_by
, summarise()
select()
for selecting columnsselect(metadata, sample, clade, cit, genome_size)
> # sample clade cit genome_size
> # 1 REL606 <NA> unknown 4.62
> # 2 REL1166A unknown unknown 4.63
> # 3 ZDB409 unknown unknown 4.60
> # 4 ZDB429 UC unknown 4.59
> # 5 ZDB446 UC unknown 4.66
> # 6 ZDB458 (C1,C2) unknown 4.63
> # 7 ZDB464* (C1,C2) unknown 4.62
> # 8 ZDB467 (C1,C2) unknown 4.61
> # 9 ZDB477 C1 unknown 4.65
> # 10 ZDB483 C3 unknown 4.59
> # 11 ZDB16 C1 unknown 4.61
> # 12 ZDB357 C2 unknown 4.62
> # 13 ZDB199* C1 minus 4.62
> # 14 ZDB200 C2 minus 4.63
> # 15 ZDB564 Cit+ plus 4.74
> # 16 ZDB30* C3 minus 4.61
> # 17 ZDB172 Cit+ plus 4.77
> # 18 ZDB158 C2 minus 4.63
> # [ reached getOption("max.print") -- omitted 12 rows ]
filter(metadata, cit == "plus")
> # sample generation clade strain cit run genome_size
> # 1 ZDB564 31500 Cit+ REL606 plus SRR098289 4.74
> # 2 ZDB172 32000 Cit+ REL606 plus SRR098042 4.77
> # 3 ZDB143 32500 Cit+ REL606 plus SRR098040 4.79
> # 4 CZB152 33000 Cit+ REL606 plus SRR097977 4.80
> # 5 CZB154 33000 Cit+ REL606 plus SRR098026 4.76
> # 6 ZDB87 34000 C2 REL606 plus SRR098035 4.75
> # 7 ZDB96 36000 Cit+ REL606 plus SRR098036 4.74
> # 8 ZDB107 38000 Cit+ REL606 plus SRR098038 4.79
> # 9 REL10979 40000 Cit+ REL606 plus SRR098029 4.78
dplyr
- select then filter%>
is the pipe operator provided by the magrittr
package and utilzed by dplyr
metadata %>% filter(cit == "plus") %>% select(sample, generation, clade)
> # sample generation clade
> # 1 ZDB564 31500 Cit+
> # 2 ZDB172 32000 Cit+
> # 3 ZDB143 32500 Cit+
> # 4 CZB152 33000 Cit+
> # 5 CZB154 33000 Cit+
> # 6 ZDB87 34000 C2
> # 7 ZDB96 36000 Cit+
> # 8 ZDB107 38000 Cit+
> # 9 REL10979 40000 Cit+
Above: 1. Send metadata
data through filter to keep rows where cit
was plus
2. Then send to select
to keep the sample
, generation
, and clade
columns
We don’t need to include the dataframe as an argument – the pipe handles that.
Let’s create a new object with this smaller dataset.
meta_citplus <- metadata %>% filter(cit == "plus") %>% select(sample, generation,
clade)
meta_citplus
> # sample generation clade
> # 1 ZDB564 31500 Cit+
> # 2 ZDB172 32000 Cit+
> # 3 ZDB143 32500 Cit+
> # 4 CZB152 33000 Cit+
> # 5 CZB154 33000 Cit+
> # 6 ZDB87 34000 C2
> # 7 ZDB96 36000 Cit+
> # 8 ZDB107 38000 Cit+
> # 9 REL10979 40000 Cit+
Challenge
Using pipes, subset the data to include rows where the clade is ‘Cit+’. >Retain columns sample, cit, and genome_size.
metadata %>% mutate(genome_bp = genome_size * 1e+06)
> # sample generation clade strain cit run genome_size
> # 1 REL606 0 <NA> REL606 unknown 4.62
> # 2 REL1166A 2000 unknown REL606 unknown SRR098028 4.63
> # 3 ZDB409 5000 unknown REL606 unknown SRR098281 4.60
> # 4 ZDB429 10000 UC REL606 unknown SRR098282 4.59
> # 5 ZDB446 15000 UC REL606 unknown SRR098283 4.66
> # 6 ZDB458 20000 (C1,C2) REL606 unknown SRR098284 4.63
> # 7 ZDB464* 20000 (C1,C2) REL606 unknown SRR098285 4.62
> # 8 ZDB467 20000 (C1,C2) REL606 unknown SRR098286 4.61
> # 9 ZDB477 25000 C1 REL606 unknown SRR098287 4.65
> # genome_bp
> # 1 4620000
> # 2 4630000
> # 3 4600000
> # 4 4590000
> # 5 4660000
> # 6 4630000
> # 7 4620000
> # 8 4610000
> # 9 4650000
> # [ reached getOption("max.print") -- omitted 21 rows ]
metadata %>% mutate(genome_bp = genome_size * 1e+06) %>% head
> # sample generation clade strain cit run genome_size
> # 1 REL606 0 <NA> REL606 unknown 4.62
> # 2 REL1166A 2000 unknown REL606 unknown SRR098028 4.63
> # 3 ZDB409 5000 unknown REL606 unknown SRR098281 4.60
> # 4 ZDB429 10000 UC REL606 unknown SRR098282 4.59
> # 5 ZDB446 15000 UC REL606 unknown SRR098283 4.66
> # 6 ZDB458 20000 (C1,C2) REL606 unknown SRR098284 4.63
> # genome_bp
> # 1 4620000
> # 2 4630000
> # 3 4600000
> # 4 4590000
> # 5 4660000
> # 6 4630000
filter()
metadata %>% mutate(genome_bp = genome_size * 1e+06) %>% filter(!is.na(clade)) %>%
head
> # sample generation clade strain cit run genome_size
> # 1 REL1166A 2000 unknown REL606 unknown SRR098028 4.63
> # 2 ZDB409 5000 unknown REL606 unknown SRR098281 4.60
> # 3 ZDB429 10000 UC REL606 unknown SRR098282 4.59
> # 4 ZDB446 15000 UC REL606 unknown SRR098283 4.66
> # 5 ZDB458 20000 (C1,C2) REL606 unknown SRR098284 4.63
> # 6 ZDB464* 20000 (C1,C2) REL606 unknown SRR098285 4.62
> # genome_bp
> # 1 4630000
> # 2 4600000
> # 3 4590000
> # 4 4660000
> # 5 4630000
> # 6 4620000
is.na()
is a function that determines whether something is or is not an NA.!
symbol negates it, so we’re asking for everything that isn’t NA
dplyr
makes this easy through group_by()
group_by()
splits data into groups upon which some operations can be runmetadata %>% group_by(cit) %>% tally()
> # Source: local data frame [3 x 2]
> #
> # cit n
> # (fctr) (int)
> # 1 minus 9
> # 2 plus 9
> # 3 unknown 12
Summary stats
When dealing with simple statistics like mean, easiest way is to just ignore NA
by using na.rm=TRUE
(rm standards for remove)
group_by()
is often used together with summarize()
which collapses each group into a single-row summary of that groupsummarize can then be used to apply a function such as genome_size
by mutant status:
metadata %>% group_by(cit) %>% summarize(mean_size = mean(genome_size, na.rm = TRUE))
> # Source: local data frame [3 x 2]
> #
> # cit mean_size
> # (fctr) (dbl)
> # 1 minus 4.614444
> # 2 plus 4.768889
> # 3 unknown 4.619167
We can group by multiple columns too:
metadata %>% group_by(cit, clade) %>% summarise(mean_size = mean(genome_size,
na.rm = TRUE))
> # Source: local data frame [13 x 3]
> # Groups: cit [?]
> #
> # cit clade mean_size
> # (fctr) (fctr) (dbl)
> # 1 minus C1 4.606667
> # 2 minus C2 4.625000
> # 3 minus C3 4.610000
> # 4 minus Cit+ 4.600000
> # 5 plus C2 4.750000
> # 6 plus Cit+ 4.771250
> # 7 unknown (C1,C2) 4.620000
> # 8 unknown C1 4.630000
> # 9 unknown C2 4.620000
> # 10 unknown C3 4.590000
> # 11 unknown UC 4.625000
> # 12 unknown unknown 4.615000
> # 13 unknown NA 4.620000
Looks like clade is missing. WE can remove using filter():
metadata %>% group_by(cit, clade) %>% summarise(mean_size = mean(genome_size,
na.rm = TRUE)) %>% filter(!is.na(clade))
> # Source: local data frame [12 x 3]
> # Groups: cit [3]
> #
> # cit clade mean_size
> # (fctr) (fctr) (dbl)
> # 1 minus C1 4.606667
> # 2 minus C2 4.625000
> # 3 minus C3 4.610000
> # 4 minus Cit+ 4.600000
> # 5 plus C2 4.750000
> # 6 plus Cit+ 4.771250
> # 7 unknown (C1,C2) 4.620000
> # 8 unknown C1 4.630000
> # 9 unknown C2 4.620000
> # 10 unknown C3 4.590000
> # 11 unknown UC 4.625000
> # 12 unknown unknown 4.615000
We can also summarize multiple variables at the same time:
metadata %>% group_by(cit, clade) %>% summarize(mean_size = mean(genome_size,
na.rm = TRUE), min_generation = min(generation))
> # Source: local data frame [13 x 4]
> # Groups: cit [?]
> #
> # cit clade mean_size min_generation
> # (fctr) (fctr) (dbl) (int)
> # 1 minus C1 4.606667 31500
> # 2 minus C2 4.625000 31500
> # 3 minus C3 4.610000 32000
> # 4 minus Cit+ 4.600000 34000
> # 5 plus C2 4.750000 34000
> # 6 plus Cit+ 4.771250 31500
> # 7 unknown (C1,C2) 4.620000 20000
> # 8 unknown C1 4.630000 25000
> # 9 unknown C2 4.620000 30000
> # 10 unknown C3 4.590000 25000
> # 11 unknown UC 4.625000 10000
> # 12 unknown unknown 4.615000 2000
> # 13 unknown NA 4.620000 0