First, we need to install and load the packages. You can install packages in RStudio by going to Tools -> Install Packages… in the menu bar. You can also install them in code (see below).

# install.packages("dplyr")
library(dplyr)

Great. Now we have dplyr loaded. Now what, you ask? Let’s load some data.

# read in the data file (space-delimited text)
lex_data <- read.delim("lexical_decision.txt", sep = " ")
# convert data to a tbl class (which is easier to read);
# as_tibble() is the current replacement for the deprecated tbl_df()
lex_data <- as_tibble(lex_data)
# view a snapshot of the data (one line per column: name, type, first values)
glimpse(lex_data)
Observations: 8,190
Variables: 44
$ ExperimentName       <fctr> HomophoneLDT, HomophoneLDT, HomophoneLDT, HomophoneLDT, HomophoneLDT, HomophoneLDT, HomophoneLDT, HomophoneLDT, HomophoneLDT,...
$ Subject              <int> 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, ...
$ Session              <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ Clock.Information    <fctr> <?xml version=1.0?>\n<Clock xmlns:dt=urn:schemas-microsoft-com:datatypes><Description dt:dt=string>E-Prime Primary Realtime C...
$ Clock.StartTimeOfDay <fctr> 3/23/2015 4:04:49 PM, 3/23/2015 4:04:49 PM, 3/23/2015 4:04:49 PM, 3/23/2015 4:04:49 PM, 3/23/2015 4:04:49 PM, 3/23/2015 4:04:...
$ Display.RefreshRate  <dbl> 59.943, 59.943, 59.943, 59.943, 59.943, 59.943, 59.943, 59.943, 59.943, 59.943, 59.943, 59.943, 59.943, 59.943, 59.943, 59.943...
$ Group                <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ RandomSeed           <int> 233752768, 233752768, 233752768, 233752768, 233752768, 233752768, 233752768, 233752768, 233752768, 233752768, 233752768, 23375...
$ SessionDate          <fctr> 03-23-2015, 03-23-2015, 03-23-2015, 03-23-2015, 03-23-2015, 03-23-2015, 03-23-2015, 03-23-2015, 03-23-2015, 03-23-2015, 03-23...
$ SessionTime          <fctr> 16:04:49, 16:04:49, 16:04:49, 16:04:49, 16:04:49, 16:04:49, 16:04:49, 16:04:49, 16:04:49, 16:04:49, 16:04:49, 16:04:49, 16:04...
$ SessionTimeUtc       <fctr> 8:04:49 PM, 8:04:49 PM, 8:04:49 PM, 8:04:49 PM, 8:04:49 PM, 8:04:49 PM, 8:04:49 PM, 8:04:49 PM, 8:04:49 PM, 8:04:49 PM, 8:04:...
$ Block                <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,...
$ Break                <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ imageability         <fctr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, low, ...
$ key                  <int> 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, ...
$ List1                <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, ...
$ List1.Cycle          <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, ...
$ List1.Sample         <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 2, ...
$ PracList             <int> 20, 29, 23, 24, 5, 9, 11, 7, 10, 17, 25, 6, 30, 28, 3, 21, 14, 13, 4, 27, 26, 19, 1, 22, 15, 8, 2, 16, 12, 18, NA, NA, NA, NA,...
$ PracList.Cycle       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ PracList.Sample      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, NA, NA, NA, NA,...
$ Procedure            <fctr> PracProc, PracProc, PracProc, PracProc, PracProc, PracProc, PracProc, PracProc, PracProc, PracProc, PracProc, PracProc, PracP...
$ ptest                <fctr> INK, TENT, MILL, OLIVE, JUM, PROAD, SIF, PAME, SHEAL, CON, OVEN, MEAP, THUMB, RUIN, DOAK, KNIT, YILE, TRULE, GROBE, RALLY, PA...
$ Running              <fctr> PracList, PracList, PracList, PracList, PracList, PracList, PracList, PracList, PracList, PracList, PracList, PracList, PracL...
$ testing              <fctr> , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , BYE, NAYVY, BOAL, PERCE, DIRE, LERCE, WARN, JAW, SANDE, DURT, FAW...
$ type                 <fctr> word, word, word, word, nonword, nonword, nonword, nonword, nonword, word, word, nonword, word, word, nonword, word, nonword,...
$ word.ACC             <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0, 0, ...
$ word.CRESP           <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ word.DurationError   <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -99999...
$ word.OnsetDelay      <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 1, ...
$ word.OnsetTime       <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 83988,...
$ word.RESP            <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, 0, ...
$ word.RT              <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 719, 1...
$ word.RTTime          <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 84707,...
$ word1.ACC            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ word1.CRESP          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ word1.DurationError  <int> -999999, -999999, -999999, -999999, -999999, -999999, -999999, -999999, -999999, -999999, -999999, -999999, -999999, -999999, ...
$ word1.OnsetDelay     <int> 1, 1, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 17, 1, 1, 1, 1, 1, 1, 16, 1, 1, 1, 1, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ word1.OnsetTime      <int> 29753, 31805, 33372, 35008, 36860, 38762, 40480, 42165, 43900, 45752, 47370, 48955, 50723, 52508, 54093, 55678, 57630, 59832, ...
$ word1.RESP           <int> 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ word1.RT             <int> 1035, 543, 632, 844, 888, 698, 660, 727, 848, 588, 570, 753, 769, 576, 567, 934, 1190, 844, 856, 715, 541, 581, 652, 719, 592,...
$ word1.RTTime         <int> 30788, 32348, 34004, 35852, 37748, 39460, 41140, 42892, 44748, 46340, 47940, 49708, 51492, 53084, 54660, 56612, 58820, 60676, ...
$ wordlike             <fctr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, l...
$ WordList             <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 142, 7...

Yikes! This is all a bit much for me to work with. Sure, I made the Eprime file, so I know what the columns are supposed to be, but those headers just aren’t intuitive. Let’s fix that (and get rid of the useless columns, too).

# give all 44 columns new names by position: the columns we care about get
# descriptive names, everything else gets a throwaway V* placeholder
names(lex_data) <- c(
  "V1", "id", "V2", "V3", "V4", "V5", "V6", "V7", "date", "V8", "V9",
  "V10", "V11", "imageability", "key", "V12", "V13", "trialorder", "V15",
  "V16", "V17", "procedure", "V18", "V19", "stimulus", "type", "V20",
  "V21", "V22", "V23", "V24", "response", "rt", "V25", "V26", "V27",
  "V28", "V29", "V30", "V31", "V32", "V33", "wordlikeness", "V34"
)
# keep just the important columns, listed in the order we want them
lex_data <- lex_data %>%
  select(
    id, date, imageability, key, procedure, stimulus, type,
    response, rt, wordlikeness, trialorder
  )

Now would be a good time to take another look at the data using the glimpse function. I’ll wait… Isn’t that a lot more manageable? I’ll take your look of astonishment as an affirmative. Anyway, we have a lot of practice items and breaks that should be filtered out of the data before any analyses. Let’s do that now.

# keep only the experimental rows (this drops the practices + breaks)
lex_data <- lex_data %>%
  filter(procedure == "AuthorProc")

Cool. Did you glimpse the data? Did you see the problem? The stimuli are factors and we are going to need them to be character strings. Fear not, fixing that is easy peasy lemon… I’m done.

# turn the stimulus column into plain character strings
lex_data <- mutate(lex_data, stimulus = as.character(stimulus))

Exhausting, I know. Here’s the thing: A lot of the data needs to be manipulated before I can analyze it. I need accuracy information. Reaction times need to be log transformed. Trial order needs to be standardized. One of our stimuli was literally the word “NULL”. R doesn’t like that, so we need to make it something else (like “NADA”). You know how it is. Time to mutate. Cowabunga!

# create an accuracy column by comparing the key and response
# (accuracy is NA whenever key or response is NA)
lex_data <- mutate(lex_data, accuracy = ifelse(key == response, 1, 0))
# create an error column (reversed accuracy without the NAs):
# an NA accuracy counts as an error
lex_data <- mutate(lex_data, error = ifelse(is.na(accuracy), 1, ifelse(accuracy == 0, 1, 0)))
# replace RTs of less than 250ms with NA (because inhuman speed)
lex_data$rt <- ifelse(lex_data$rt < 250, NA, lex_data$rt)
# create a correct_rt column with only correct trial RTs (because science)
lex_data <- lex_data %>% mutate(correct_rt = ifelse(error == 1, NA, rt))
# create a log_rt column
lex_data <- mutate(lex_data, log_rt = log(correct_rt))
# standardize the trial order; scale() returns a one-column matrix, so
# convert the result back to a plain numeric vector to keep an ordinary column
lex_data$trialorder <- as.numeric(scale(lex_data$trialorder))
# fix the missing stimulus (it shows up as an empty string)
lex_data$stimulus[lex_data$stimulus == ""] <- "NADA"

Now that you’ve learned the secret of the ooze, take another glimpse of the data. That looks better, amirite? Sadly, we have some new columns making the old ones unnecessary. Here comes the select function again to save the day.

# keep (and rename and reorder) only the columns we need from here on
lex_data <- lex_data %>%
  select(
    id,
    rt = log_rt,
    err = error,
    stim = stimulus,
    type,
    img = imageability,
    word = wordlikeness,
    order = trialorder
  )
lex_data %>% glimpse()
Observations: 7,200
Variables: 8
$ id    <int> 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 6101, 610...
$ rt    <dbl> 6.577861, 7.134891, 6.735780, 6.693324, 6.572283, 6.541030, 6.280396, 6.721426, 6.318968, 6.603944, 6.536692, 7.375256, 6.697034, 6.641182, 7...
$ err   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
$ stim  <chr> "BYE", "NAYVY", "BOAL", "PERCE", "DIRE", "LERCE", "WARN", "JAW", "SANDE", "DURT", "FAWLT", "TEAR", "HAIK", "BOR", "WITCH", "JALE", "TRALE", "...
$ type  <fctr> word, pseudohomophone, pseudohomophone, pseudohomophone, word, nonword, word, word, pseudohomophone, pseudohomophone, pseudohomophone, word,...
$ img   <fctr> low, NA, NA, NA, low, NA, low, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, high, low, NA, NA, NA, NA, low, NA, NA, high, NA, NA, high, NA, NA, N...
$ word  <fctr> NA, less, more, less, NA, less, NA, NA, less, more, less, NA, less, more, NA, more, more, NA, NA, less, less, less, less, NA, less, more, NA...
$ order <dbl> -1.7194971, -1.7052863, -1.6910756, -1.6768649, -1.6626542, -1.6484435, -1.6342327, -1.6200220, -1.6058113, -1.5916006, -1.5773899, -1.563179...

Isn’t it glorious? Almost. We need to fix a couple more data structures. (But seriously, we are almost to the fun part, I promise. Oh, you thought this was the fun part? Well, in that case, we are kindred spirits, friend.)

# change variables to more useful types:
# the participant id becomes a factor and the error flag an integer
lex_data <- lex_data %>%
  mutate(
    id = as.factor(id),
    err = as.integer(err)
  )

Since we converted to a tbl data structure, we can see the head information just by typing the name of the data frame (lex_data) into the console. Try it. It’s like glimpse, but wider!

Okay, now that the data is in a nice and neat format, let’s dive into that data! How about getting the means of all of the columns? Oh boy, I wonder what will happen.

# summarize each column to get means
# (across() replaces the deprecated summarise_each() / funs() pair)
# mean() warns and returns NA for non-numeric columns, and returns NA for
# numeric columns that contain NA values
summarise(lex_data, across(everything(), mean))
argument is not numeric or logical: returning NA
argument is not numeric or logical: returning NA
argument is not numeric or logical: returning NA
argument is not numeric or logical: returning NA
argument is not numeric or logical: returning NA

Well, I can see the mean error rate is just about seven percent and that the mean of order is zero because we standardized it. But what happened to the other columns? Factors and character strings have no means, but RT should have one, right? Yes, it should. But there are lots of NA values that screw everything up. Let’s take a closer look at that column while smoking our pipe (%>%).

# start from lex_data, drop the rows where RT is NA, then average what is left
lex_data %>%
  filter(!is.na(rt)) %>%
  summarise(mean(rt))

That’s better, but “what if I want a separate mean RT for each participant?” you ask? Behold!

# group by subject id so that each participant gets their own mean RT
lex_data %>%
  filter(!is.na(rt)) %>%
  group_by(id) %>%
  summarise(mean(rt))
# the same grouping works for error rates (no NA filter on rt here)
lex_data %>%
  group_by(id) %>%
  summarise(mean(err))

What about those manipulations in our data? Those seem important, right? Yo, I’ll tell you what I want. What I really, really want. I wanna, really really really wanna determine effects of pseudo-homophony. (This is going to be THE hit of 2017.) Oh, and it sure would be nice to save those results in tables.

# create a table with subject mean RTs for pseudohomophones
# (rows with an NA rt are dropped before averaging)
pseudohomophone_rt <- lex_data %>%
  filter(!is.na(rt), type == "pseudohomophone") %>%
  group_by(id) %>%
  summarise(ph_rt = mean(rt))
# create a table with subject mean RTs for nonwords
nonword_rt <- lex_data %>%
  filter(!is.na(rt), type == "nonword") %>%
  group_by(id) %>%
  summarise(nw_rt = mean(rt))
# create a table with subject mean error rates for pseudohomophones
pseudohomophone_err <- lex_data %>%
  filter(type == "pseudohomophone") %>%
  group_by(id) %>%
  summarise(ph_err = mean(err))
# create a table with subject mean error rates for nonwords
nonword_err <- lex_data %>%
  filter(type == "nonword") %>%
  group_by(id) %>%
  summarise(nw_err = mean(err))

Remember how we had too many columns before? Well, now we have too many tables. The last thing I have to teach you (until next week, at least) is how to combine these tables using one of the join functions. Don’t think about goodbyes. Just enjoy this beautiful code. (Okay, one annoying thing is that you cannot join more than two tables at once. Maybe someday the package will be updated to support this functionality. In my dreams.)

# combine tables using full_join
# (no `by` is given, so the join uses the column the two tables share)
both_pseudohomophones <- full_join(pseudohomophone_rt, pseudohomophone_err)
Joining, by = "id"
# nonword mean RTs and error rates side by side
both_nonwords <- full_join(nonword_rt, nonword_err)
Joining, by = "id"
# pseudohomophone and nonword summaries together in one table
pseudohomophone_effects <- full_join(both_pseudohomophones, both_nonwords)
Joining, by = "id"
# remove the intermediate tables to declutter the Global Environment (top right)
rm(
  pseudohomophone_rt, pseudohomophone_err, both_pseudohomophones,
  nonword_rt, nonword_err, both_nonwords
)
# write both data sets to disk: the trial data as space-delimited text,
# the per-subject effects table as csv
write.table(lex_data, file = "lex_data.txt")
write.csv(pseudohomophone_effects, file = "ph_effects.csv")

Glimpse that data! Hey, maybe you can try doing the same thing for imageability and wordlikeness. I’ll leave a little code block here for you to give it a try:

Did it work? You did it, right? You did? That’s great! Congratulations, you made it to the end of the tutorial! (And if you didn’t do it… Well, congrats on exercising your free will, right? Wrong. Free will is an illusion. Go back and do it. I’ll wait…)

