Xianjun Dong, PhD | Bioinformatics Club | http://bioinformatics.bwh.harvard.edu
Xianjun Dong, PhD
http://bioinformatics.bwh.harvard.edu
method | wide->long | long->wide |
---|---|---|
reshape2 | melt | cast |
tidyr | gather | spread |
spreadsheet | unpiovt | pivot |
database | fold | unfold |
# variable value
# 1 ozone 23.615
# 2 ozone 29.444
# 3 ozone 59.115
# 4 ozone 59.962
# 5 wind 11.623
# 6 wind 10.267
# 7 wind 8.942
# 8 wind 8.794
# 9 temp 65.548
# ...
# ozone wind temp
# 1 23.62 11.623 65.55
# 2 29.44 10.267 79.10
# 3 59.12 8.942 83.90
# 4 59.96 8.794 83.97
ggplot2
, plyr
, lm()
, glm()
etc. reshape2 is based around two key functions: melt
and cast
:
melt
: convert wide- to long-formatcast
: convert long- to wide-formatThink of working with metal: if you melt metal, it drips and becomes long. If you cast it into a mould, it becomes wide.
# Daily air quality measurements in New York, May to September 1973.
# - Ozone: Mean trioxygen in parts per billion
# - Solar.R: Solar radiation in Langleys
# - Wind: Average wind speed in miles per hour
# - Temp: Maximum daily temperature in degrees Fahrenheit
names(airquality) <- tolower(names(airquality))
head(airquality)
## ozone solar.r wind temp month day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
By default, melt
has assumed that all columns with numeric values are variables with values.
library(reshape2)
aql <- melt(airquality) # [a]ir [q]uality [l]ong format
## No id variables; using all as measure variables
head(aql)
## variable value
## 1 ozone 41
## 2 ozone 36
## 3 ozone 12
## 4 ozone 18
## 5 ozone NA
## 6 ozone 28
tail(aql)
## variable value
## 913 day 25
## 914 day 26
## 915 day 27
## 916 day 28
## 917 day 29
## 918 day 30
What about if we want to know the values of ozone
, solar.r
, wind
, and temp
for each month and day?
aql <- melt(airquality, id.vars = c("month", "day"))
head(aql)
## month day variable value
## 1 5 1 ozone 41
## 2 5 2 ozone 36
## 3 5 3 ozone 12
## 4 5 4 ozone 18
## 5 5 5 ozone NA
## 6 5 6 ozone 28
?melt
Usage
# S3 method for class 'data.frame'
melt(data, id.vars, measure.vars,
variable.name = "variable", ..., na.rm = FALSE, value.name = "value",
factorsAsStrings = TRUE)
Arguments
argument | meaning |
---|---|
data | data frame to melt |
id.vars | vector of id variables. Can be integer (variable position) or string (variable name). If blank, will use all non-measured variables. |
measure.vars | vector of measured variables. Can be integer (variable position) or string (variable name)If blank, will use all non id.vars |
variable.name | name of variable used to store measured variable names |
… | further arguments passed to or from other methods. |
na.rm | Should NA values be removed from the data set? This will convert explicit missings to implicit missings. |
value.name | name of variable used to store values |
factorsAsStrings | Control whether factors are converted to character when melted as measure variables. When FALSE, coercion is forced if levels are not identical across the measure.vars. |
aql <- melt(airquality, id.vars = c("month", "day"),
variable.name = "climate_variable",
value.name = "climate_value")
head(aql)
## month day climate_variable climate_value
## 1 5 1 ozone 41
## 2 5 2 ozone 36
## 3 5 3 ozone 12
## 4 5 4 ozone 18
## 5 5 5 ozone NA
## 6 5 6 ozone 28
dim(aql)
## [1] 612 4
aql <- melt(airquality, id.vars = c("month", "day"), measure.vars=c("ozone",'temp'),
variable.name = "climate_variable",
value.name = "climate_value")
dim(aql)
## [1] 306 4
dcast uses a formula to describe the shape of the data. The arguments on the left refer to the ID variables and the arguments on the right refer to the measured variables.
aql <- melt(airquality, id.vars = c("month", "day"))
aqw <- dcast(aql, month + day ~ variable)
head(aqw)
## month day ozone solar.r wind temp
## 1 5 1 41 190 7.4 67
## 2 5 2 36 118 8.0 72
## 3 5 3 12 149 12.6 74
## 4 5 4 18 313 11.5 62
## 5 5 5 NA NA 14.3 56
## 6 5 6 28 NA 14.9 66
head(airquality) # original data
## ozone solar.r wind temp month day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
… is used when there is more than one value per data cell. For example,
aql <- melt(airquality, id.vars = c("month", "day"))
dcast(aql, month ~ variable)
## Aggregation function missing: defaulting to length
## month ozone solar.r wind temp
## 1 5 31 31 31 31
## 2 6 30 30 30 30
## 3 7 31 31 31 31
## 4 8 31 31 31 31
## 5 9 30 30 30 30
dcast(aql, month ~ variable, fun.aggregate = mean, na.rm = TRUE)
## month ozone solar.r wind temp
## 1 5 23.61538 181.2963 11.622581 65.54839
## 2 6 29.44444 190.1667 10.266667 79.10000
## 3 7 59.11538 216.4839 8.941935 83.90323
## 4 8 59.96154 171.8571 8.793548 83.96774
## 5 9 31.44828 167.4333 10.180000 76.90000
dcast(data, formula, fun.aggregate = NULL, ..., margins = NULL,
subset = NULL, fill = NULL, drop = TRUE,
value.var = guess_value(data))
aql <- melt(airquality, id.vars = c("month", "day"))
library(plyr) # needed to access . function
# subset
head(dcast(aql, month + day ~ variable, subset = .(variable == 'temp')))
## month day temp
## 1 5 1 67
## 2 5 2 72
## 3 5 3 74
## 4 5 4 62
## 5 5 5 56
## 6 5 6 66
# summarize rows and columns
head(dcast(aql, month ~ variable, fun.aggregate = max, na.rm = TRUE, margins = T))
## Warning in .fun(.value[0], ...): no non-missing arguments to max; returning
## -Inf
## month ozone solar.r wind temp (all)
## 1 5 115 334 20.1 81 334
## 2 6 71 332 20.7 93 332
## 3 7 135 314 14.9 92 314
## 4 8 168 273 15.5 97 273
## 5 9 96 259 16.6 93 259
## 6 (all) 168 334 20.7 97 334
head(dcast(aql, month ~ variable, fun.aggregate = mean, na.rm = TRUE, margins = T))
## month ozone solar.r wind temp (all)
## 1 5 23.61538 181.2963 11.622581 65.54839 68.70696
## 2 6 29.44444 190.1667 10.266667 79.10000 87.38384
## 3 7 59.11538 216.4839 8.941935 83.90323 93.49748
## 4 8 59.96154 171.8571 8.793548 83.96774 79.71207
## 5 9 31.44828 167.4333 10.180000 76.90000 71.82689
## 6 (all) 42.12931 185.9315 9.957516 77.88235 80.05722
Happy families are all alike; every unhappy family is unhappy in its own way.
Leo Tolstoy
Tidy datasets are all alike but every messy dataset is messy in its own way.
Hadley Wickham
read.csv("preg.csv", stringsAsFactors = FALSE)
#> name treatmenta treatmentb
#> 1 John Smith NA 18
#> 2 Jane Doe 4 1
#> 3 Mary Johnson 6 7
read.csv("preg2.csv", stringsAsFactors = FALSE)
#> treatment John.Smith Jane.Doe Mary.Johnson
#> 1 a NA 4 6
#> 2 b 18 1 7
A dataset is a collection of values. Every value belongs to a variable and an observation. (http://vita.had.co.nz/papers/tidy-data.html)
preg2
#> name treatment n
#> 1 Jane Doe a 4
#> 2 Jane Doe b 1
#> 3 John Smith a NA
#> 4 John Smith b 18
#> 5 Mary Johnson a 6
#> 6 Mary Johnson b 7
The goal of tidyr is to help you create a tidy data. Tidy data is data where:
# This dataset explores the relationship between income and religion in the US.
pew <- tbl_df(read.csv("pew.csv", stringsAsFactors = FALSE, check.names = FALSE))
#> # A tibble: 18 × 11
#> religion `<$10k` `$10-20k` `$20-30k` `$30-40k` `$40-50k`
#> <chr> <int> <int> <int> <int> <int>
#> 1 Agnostic 27 34 60 81 76
#> 2 Atheist 12 27 37 52 35
#> 3 Buddhist 27 21 30 34 33
#> 4 Catholic 418 617 732 670 638
#> 5 Don’t know/refused 15 14 15 11 10
#> 6 Evangelical Prot 575 869 1064 982 881
#> 7 Hindu 1 9 7 9 11
#> 8 Historically Black Prot 228 244 236 238 197
#> # ... with 8 more rows, and 5 more variables: `$50-75k` <int>,
#> # `$75-100k` <int>, `$100-150k` <int>, `>150k` <int>, `Don't
#> # know/refused` <int>
pew %>% gather(income, frequency, -religion)
#> # A tibble: 180 × 3
#> religion income frequency
#> <chr> <chr> <int>
#> 1 Agnostic <$10k 27
#> 2 Atheist <$10k 12
#> 3 Buddhist <$10k 27
#> 4 Catholic <$10k 418
#> 5 Don’t know/refused <$10k 15
#> 6 Evangelical Prot <$10k 575
#> 7 Hindu <$10k 1
#> 8 Historically Black Prot <$10k 228
#> 9 Jehovah's Witness <$10k 20
#> 10 Jewish <$10k 19
#> # ... with 170 more rows
# WHO dataset for the counts of confirmed tuberculosis cases by country, year, and demographic group
# The demographic groups are broken down by sex (m, f) and age (0-14, 15-25, 25-34, 35-44, 45-54, 55-64, unknown).
tb <- tbl_df(read.csv("tb.csv", stringsAsFactors = FALSE))
#> # A tibble: 5,769 × 22
#> iso2 year m04 m514 m014 m1524 m2534 m3544 m4554 m5564 m65 mu
#> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 AD 1989 NA NA NA NA NA NA NA NA NA NA
#> 2 AD 1990 NA NA NA NA NA NA NA NA NA NA
#> 3 AD 1991 NA NA NA NA NA NA NA NA NA NA
#> 4 AD 1992 NA NA NA NA NA NA NA NA NA NA
#> 5 AD 1993 NA NA NA NA NA NA NA NA NA NA
#> 6 AD 1994 NA NA NA NA NA NA NA NA NA NA
#> 7 AD 1996 NA NA 0 0 0 4 1 0 0 NA
#> # ... with 5,759 more rows, and 10 more variables: f04 <int>, f514 <int>,
#> # f014 <int>, f1524 <int>, f2534 <int>, f3544 <int>, f4554 <int>,
#> # f5564 <int>, f65 <int>, fu <int>
# We first gather up the non-variable columns:
tb2 <- tb %>%
gather(demo, n, -iso2, -year, na.rm = TRUE)
tb2
#> # A tibble: 35,750 × 4
#> iso2 year demo n
#> * <chr> <int> <chr> <int>
#> 1 AD 2005 m04 0
#> 2 AD 2006 m04 0
#> 3 AD 2008 m04 0
#> 4 AE 2006 m04 0
#> 5 AE 2007 m04 0
#> 6 AE 2008 m04 0
#> # ... with 35,740 more rows
# Then use separate() to split a compound variables into individual variables.
tb3 <- tb2 %>%
separate(demo, c("sex", "age"), 1)
tb3
#> # A tibble: 35,750 × 5
#> iso2 year sex age n
#> * <chr> <int> <chr> <chr> <int>
#> 1 AD 2005 m 04 0
#> 2 AD 2006 m 04 0
#> 3 AD 2008 m 04 0
#> 4 AE 2006 m 04 0
#> 5 AE 2007 m 04 0
#> 6 AE 2008 m 04 0
#> # ... with 35,740 more rows
# daily weather data for one weather station (MX17004) in Mexico for five months in 2010.
# It has variables in individual columns (id, year, month), spread across columns (day, d1-d31) and across rows (tmin, tmax) (minimum and maximum temperature).
weather <- tbl_df(read.csv("weather.csv", stringsAsFactors = FALSE))
#> # A tibble: 22 × 35
#> id year month element d1 d2 d3 d4 d5 d6 d7
#> <chr> <int> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 MX17004 2010 1 tmax NA NA NA NA NA NA NA
#> 2 MX17004 2010 1 tmin NA NA NA NA NA NA NA
#> 3 MX17004 2010 2 tmax NA 27.3 24.1 NA NA NA NA
#> 4 MX17004 2010 2 tmin NA 14.4 14.4 NA NA NA NA
#> 5 MX17004 2010 3 tmax NA NA NA NA 32.1 NA NA
#> 6 MX17004 2010 3 tmin NA NA NA NA 14.2 NA NA
#> # ... with 12 more rows, and 24 more variables: d8 <dbl>, d9 <lgl>,
#> # d10 <dbl>, d11 <dbl>, d12 <lgl>, d13 <dbl>, d14 <dbl>, d15 <dbl>,
#> # d16 <dbl>, d17 <dbl>, d18 <lgl>, d19 <lgl>, d20 <lgl>, d21 <lgl>,
#> # d22 <lgl>, d23 <dbl>, d24 <lgl>, d25 <dbl>, d26 <dbl>, d27 <dbl>,
#> # d28 <dbl>, d29 <dbl>, d30 <dbl>, d31 <dbl>
# we first gather the day columns:
weather2 <- weather %>%
gather(day, value, d1:d31, na.rm = TRUE)
weather2
#> # A tibble: 66 × 6
#> id year month element day value
#> * <chr> <int> <int> <chr> <chr> <dbl>
#> 1 MX17004 2010 12 tmax d1 29.9
#> 2 MX17004 2010 12 tmin d1 13.8
#> 3 MX17004 2010 2 tmax d2 27.3
#> 4 MX17004 2010 2 tmin d2 14.4
#> 5 MX17004 2010 11 tmax d2 31.3
#> 6 MX17004 2010 11 tmin d2 16.3
# and a little cleaning
weather3 <- weather2 %>%
mutate(day = extract_numeric(day)) %>%
select(id, year, month, day, element, value) %>%
arrange(id, year, month, day)
#> extract_numeric() is deprecated: please use readr::parse_number() instead
weather3
#> # A tibble: 66 × 6
#> id year month day element value
#> <chr> <int> <int> <dbl> <chr> <dbl>
#> 1 MX17004 2010 1 30 tmax 27.8
#> 2 MX17004 2010 1 30 tmin 14.5
#> 3 MX17004 2010 2 2 tmax 27.3
#> 4 MX17004 2010 2 2 tmin 14.4
#> 5 MX17004 2010 2 3 tmax 24.1
# then spread the element and value columns back out into the columns
weather3 %>% spread(element, value)
#> # A tibble: 33 × 6
#> id year month day tmax tmin
#> * <chr> <int> <int> <dbl> <dbl> <dbl>
#> 1 MX17004 2010 1 30 27.8 14.5
#> 2 MX17004 2010 2 2 27.3 14.4
#> 3 MX17004 2010 2 3 24.1 14.4
#> 4 MX17004 2010 2 11 29.7 13.4
#> 5 MX17004 2010 2 23 29.9 10.7
#> 6 MX17004 2010 3 5 32.1 14.2
#> # ... with 23 more rows
# the date a song first entered the billboard top 100 and the rank in each week after it enters
billboard <- tbl_df(read.csv("billboard.csv", stringsAsFactors = FALSE))
billboard
#> # A tibble: 317 × 81
#> year artist track time date.entered wk1
#> <int> <chr> <chr> <chr> <chr> <int>
#> 1 2000 2 Pac Baby Don't Cry (Keep... 4:22 2000-02-26 87
#> 2 2000 2Ge+her The Hardest Part Of ... 3:15 2000-09-02 91
#> 3 2000 3 Doors Down Kryptonite 3:53 2000-04-08 81
#> 4 2000 3 Doors Down Loser 4:24 2000-10-21 76
#> 5 2000 504 Boyz Wobble Wobble 3:35 2000-04-15 57
#> 6 2000 98^0 Give Me Just One Nig... 3:24 2000-08-19 51
#> # ... with 307 more rows, and 75 more variables: wk2 <int>, wk3 <int>,
#> # wk4 <int>, wk5 <int>, wk6 <int>, wk7 <int>, wk8 <int>, wk9 <int>,
#> # wk10 <int>, wk11 <int>, wk12 <int>, wk13 <int>, wk14 <int>,
#> # wk15 <int>, wk16 <int>, wk17 <int>, wk18 <int>, wk19 <int>,
#> # wk20 <int>, wk21 <int>, wk22 <int>, wk23 <int>, wk24 <int>,
#> # wk25 <int>, wk26 <int>, wk27 <int>, wk28 <int>, wk29 <int>, .... wk76
# gather week, convert week to number, date
billboard3 <- billboard %>%
gather(week, rank, wk1:wk76, na.rm = TRUE) %>%
mutate(
week = extract_numeric(week),
date = as.Date(date.entered) + 7 * (week - 1)) %>%
select(-date.entered) %>% arrange(artist, track, week)
#> # A tibble: 5,307 × 7
#> year artist track time week rank date
#> <int> <chr> <chr> <chr> <dbl> <int> <date>
#> 1 2000 2 Pac Baby Don't Cry (Keep... 4:22 1 87 2000-02-26
#> 2 2000 2 Pac Baby Don't Cry (Keep... 4:22 2 82 2000-03-04
#> 3 2000 2 Pac Baby Don't Cry (Keep... 4:22 3 72 2000-03-11
#> 4 2000 2 Pac Baby Don't Cry (Keep... 4:22 4 77 2000-03-18
#> 5 2000 2 Pac Baby Don't Cry (Keep... 4:22 5 87 2000-03-25
#> 6 2000 2 Pac Baby Don't Cry (Keep... 4:22 6 94 2000-04-01
# split billboard3 into song and rank
song <- billboard3 %>%
select(artist, track, year, time) %>%
unique() %>%
mutate(song_id = row_number())
song
#> # A tibble: 317 × 5
#> artist track year time song_id
#> <chr> <chr> <int> <chr> <int>
#> 1 2 Pac Baby Don't Cry (Keep... 2000 4:22 1
#> 2 2Ge+her The Hardest Part Of ... 2000 3:15 2
#> 3 3 Doors Down Kryptonite 2000 3:53 3
#> 4 3 Doors Down Loser 2000 4:24 4
#> 5 504 Boyz Wobble Wobble 2000 3:35 5
#> 6 98^0 Give Me Just One Nig... 2000 3:24 6
# split billboard3 into song and rank
# by replacing repeated song facts with a pointer to song details (a unique song id)
rank <- billboard3 %>%
left_join(song, c("artist", "track", "year", "time")) %>%
select(song_id, date, week, rank) %>%
arrange(song_id, date)
rank
#> # A tibble: 5,307 × 4
#> song_id date week rank
#> <int> <date> <dbl> <int>
#> 1 1 2000-02-26 1 87
#> 2 1 2000-03-04 2 82
#> 3 1 2000-03-11 3 72
#> 4 1 2000-03-18 4 77
#> 5 1 2000-03-25 5 87
#> 6 1 2000-04-01 6 94
# ldply() loops over each path, reading in the csv file and combining the results into a single data frame.
library(plyr)
paths <- dir("data", pattern = "\\.csv$", full.names = TRUE)
names(paths) <- basename(paths)
ldply(paths, read.csv, stringsAsFactors = FALSE)