How to represent categorical information in R?
You can create data/feline-data.csv
using a text editor (Nano), or within RStudio with the File -> New File -> Text File menu item.
coat,weight,likes_string
calico,2.1,1
black,5.0,0
tabby,3.2,1
cats <- read.csv(file = "data/feline-data.csv")
## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : incomplete final line found by readTableHeader on 'data/feline-
## data.csv'
cats
## coat weight likes_string
## 1 calico 2.1 1
## 2 black 5.0 0
## 3 tabby 3.2 1
read.csv
is used for reading in tabular data stored in a text file, with columns separated by commas
cats$weight
## [1] 2.1 5.0 3.2
cats$coat
## [1] calico black tabby
## Levels: black calico tabby
## say we discovered that the scale weighs two Kg light:
cats$weight + 2
## [1] 4.1 7.0 5.2
paste("My cat is", cats$coat)
## [1] "My cat is calico" "My cat is black" "My cat is tabby"
but what about
cats$weight + cats$coat
## Warning in Ops.factor(cats$weight, cats$coat): '+' not meaningful for
## factors
## [1] NA NA NA
data types
typeof(cats$weight)
## [1] "double"
5 main types:
typeof(3.14)
## [1] "double"
typeof(1L)
## [1] "integer"
typeof(1+1i)
## [1] "complex"
typeof(TRUE)
## [1] "logical"
typeof('banna')
## [1] "character"
note the L suffix, which indicates that a number is an integer
file.show("data/feline-data_v2.csv")
weight
cats <- read.csv(file="data/feline-data_v2.csv")
## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : incomplete final line found by readTableHeader on 'data/feline-
## data_v2.csv'
typeof(cats$weight)
## [1] "integer"
cats$weight + 2
## Warning in Ops.factor(cats$weight, 2): '+' not meaningful for factors
## [1] NA NA NA NA
what happened?
if R cannot read every value in a column as a double,
then nobody in the column gets to be a double
the table itself is a data structure: a structure that R knows how to build out of basic data types.
we can see that it is a data.frame
by calling the class function on it:
class(cats)
## [1] "data.frame"
before:
coat,weight,likes_string
calico,2.1,1
black,5.0,0
tabby,3.2,1
tabby,2.3 or 2.4,1
after:
coat,weight,likes_string
calico,2.1,1
black,5.0,0
tabby,3.2,1
cats <- read.csv(file="data/feline-data.csv")
## Warning in read.table(file = file, header = header, sep = sep, quote =
## quote, : incomplete final line found by readTableHeader on 'data/feline-
## data.csv'
my_vector <- vector(length = 3)
my_vector
## [1] FALSE FALSE FALSE
another_vector <- vector(mode='character', length = 3)
another_vector
## [1] "" "" ""
str(another_vector)
## chr [1:3] "" "" ""
str(cats$weight)
## num [1:3] 2.1 5 3.2
the columns of data.frames
are all vectors
concat_vector <- c(2,6,3)
concat_vector
## [1] 2 6 3
quiz_vector <- c(2,6,'3')
str(quiz_vector)
## chr [1:3] "2" "6" "3"
coercion <- c('a', TRUE)
str(coercion)
## chr [1:2] "a" "TRUE"
another_coercion_vector <- c(0, TRUE)
another_coercion_vector
## [1] 0 1
->
can be read as "are transformed into"
the as.
functions (as.numeric, as.logical, ...) force a coercion explicitly
character_vector_example <- c('0','2','4')
character_vector_example
## [1] "0" "2" "4"
character_coerced_to_numeric <- as.numeric(character_vector_example)
character_coerced_to_numeric
## [1] 0 2 4
numeric_coerced_to_logical <- as.logical(character_coerced_to_numeric)
numeric_coerced_to_logical
## [1] FALSE TRUE TRUE
make sure everything is the same type in your vectors and your columns of data.frames
or you will get bad surprises
we can coerce this column by using as.logical
cats$likes_string
## [1] 1 0 1
cats$likes_string <- as.logical(cats$likes_string)
cats$likes_string
## [1] TRUE FALSE TRUE
ab_vector <- c('a', 'b')
ab_vector
## [1] "a" "b"
concat_example <- c(ab_vector, 'SWC')
concat_example
## [1] "a" "b" "SWC"
mySeries <- 1:10
mySeries
## [1] 1 2 3 4 5 6 7 8 9 10
seq(10)
## [1] 1 2 3 4 5 6 7 8 9 10
seq(1,10, by=0.1)
## [1] 1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3
## [15] 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7
## [29] 3.8 3.9 4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0 5.1
## [43] 5.2 5.3 5.4 5.5 5.6 5.7 5.8 5.9 6.0 6.1 6.2 6.3 6.4 6.5
## [57] 6.6 6.7 6.8 6.9 7.0 7.1 7.2 7.3 7.4 7.5 7.6 7.7 7.8 7.9
## [71] 8.0 8.1 8.2 8.3 8.4 8.5 8.6 8.7 8.8 8.9 9.0 9.1 9.2 9.3
## [85] 9.4 9.5 9.6 9.7 9.8 9.9 10.0
sequence_example <- seq(10)
head(sequence_example, n=2)
## [1] 1 2
tail(sequence_example, n=4)
## [1] 7 8 9 10
length(sequence_example)
## [1] 10
class(sequence_example)
## [1] "integer"
typeof(sequence_example)
## [1] "integer"
names_example <- 5:8
names(names_example) <- c("a", "b", "c", "d")
names_example
## a b c d
## 5 6 7 8
names(names_example)
## [1] "a" "b" "c" "d"
we said that the columns of data.frames
were vectors:
str(cats$weight)
## num [1:3] 2.1 5 3.2
str(cats$likes_string)
## logi [1:3] TRUE FALSE TRUE
str(cats$coat)
## Factor w/ 3 levels "black","calico",..: 2 1 3
coats <- c('tabby', 'tortoiseshell', 'tortoiseshell', 'black', 'tabby')
coats
## [1] "tabby" "tortoiseshell" "tortoiseshell" "black"
## [5] "tabby"
str(coats)
## chr [1:5] "tabby" "tortoiseshell" "tortoiseshell" "black" ...
CATegories <- factor(coats)
class(CATegories)
## [1] "factor"
str(CATegories)
## Factor w/ 3 levels "black","tabby",..: 2 3 3 1 2
typeof(coats)
## [1] "character"
typeof(CATegories)
## [1] "integer"
http://swcarpentry.github.io/r-novice-gapminder/04-data-structures-part1#challenge-2
mydata <- c("case", "control", "control", "case")
factor_ordering_example <- factor(mydata, levels = c("control", "case"))
str(factor_ordering_example)
## Factor w/ 2 levels "control","case": 2 1 1 2
list
list_example <- list(1, "a", TRUE, 1+4i)
list_example
## [[1]]
## [1] 1
##
## [[2]]
## [1] "a"
##
## [[3]]
## [1] TRUE
##
## [[4]]
## [1] 1+4i
another_list <- list(title = "Research Bazaar", numbers = 1:10, data = TRUE )
another_list
## $title
## [1] "Research Bazaar"
##
## $numbers
## [1] 1 2 3 4 5 6 7 8 9 10
##
## $data
## [1] TRUE
data.frame
what happens if we call typeof on it:
typeof(cats)
## [1] "list"
data.frames
look like lists under the hood
a data.frame is a special list in which all vectors must have the same length
in our cats example we have an integer, a double and a logical variable
cats$coat
## [1] calico black tabby
## Levels: black calico tabby
cats[,1]
## [1] calico black tabby
## Levels: black calico tabby
typeof(cats[,1])
## [1] "integer"
str(cats[,1])
## Factor w/ 3 levels "black","calico",..: 2 1 3
each row is an observation of different variables, itself a data.frame,
and thus can be composed of elements of different types
cats[1,]
## coat weight likes_string
## 1 calico 2.1 TRUE
typeof(cats[1,])
## [1] "list"
str(cats[1,])
## 'data.frame': 1 obs. of 3 variables:
## $ coat : Factor w/ 3 levels "black","calico",..: 2
## $ weight : num 2.1
## $ likes_string: logi TRUE
matrix_example <- matrix(0, ncol=6, nrow=3)
matrix_example
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0 0 0 0 0 0
## [2,] 0 0 0 0 0 0
## [3,] 0 0 0 0 0 0
class(matrix_example)
## [1] "matrix"
typeof(matrix_example)
## [1] "double"
str(matrix_example)
## num [1:3, 1:6] 0 0 0 0 0 0 0 0 0 0 ...
dim(matrix_example)
## [1] 3 6
nrow(matrix_example)
## [1] 3