# Mixed-effects analysis of word `duration` read from "EMNLPdata.allNgrams".
#
# The script filters the input table, derives log10-scaled frequency columns
# and mean-centered predictors, and then fits a series of lme4::lmer models
# with by-speaker random effects, comparing them with anova(). Console output
# is mirrored into "R-EMNLP-trace.txt" through sink().
#
# Bare model names and bare lmer() calls below rely on top-level
# auto-printing, so run the file with Rscript or source(..., echo = TRUE).

library(lme4)

# Clears the global workspace (kept from the original script). This does not
# detach packages or reset options.
rm(list = ls())

# Add mean-centered copies of columns to a data frame.
#
# df:       a data frame.
# centered: named character vector; names are the columns to create (or
#           overwrite), values are the existing columns they are derived from.
# Returns `df` with the centered columns added.
center_columns <- function(df, centered) {
  for (target in names(centered)) {
    values <- df[[centered[[target]]]]
    df[[target]] <- values - mean(values)
  }
  df
}

sink("R-EMNLP-trace.txt", append = TRUE, split = TRUE)

# Load and filter the data ----

data <- read.table("EMNLPdata.allNgrams", header = TRUE)
head(data)

d <- subset(
  data,
  !is.na(srprsl) & !is.na(ami_3gram) &
    duration > 0 & duration < 2 &
    !is.na(GIGAfreq) & mary_dur > 0 &
    ami_3gramwd != "" & ptb_3gramwd != "" & boword != "" &
    srprsl > 0
)

# One factor level per (meeting, speaker) pair.
d$speakerid <- as.factor(paste(d$meeting, d$speaker))

# log10 of each raw count divided by a fixed constant.
# NOTE(review): the divisors are presumably corpus sizes -- confirm.
d$gigafreq <- log10(d$GIGAfreq / 151066512)
d$PTBfreq <- log10(d$ptbfreq / 6152488)
d$AMIfreq <- log10(d$amifreq / 978352)

# Rows with an empty `boword` were already removed by the subset() above, so
# this flag is FALSE for every remaining row.
d$unkword <- d$boword == ""
summary(d)

# Mean-centered predictors ----

all_centered <- c(
  cmary_dur = "mary_dur",
  cmary_context = "mary_context",
  cAmiFreq = "AMIfreq",
  cgigafreq = "gigafreq",
  csrprsl = "srprsl",
  cami_3gram = "ami_3gram",
  cptb_3gram = "ptb_3gram",
  cbnc_3gram = "bnc_3gram",
  cami_4gram = "ami_4gram",
  cptb_4gram = "ptb_4gram",
  cbnc_4gram = "bnc_4gram",
  csynSp = "synSp",
  clexSp = "lexSp",
  cgiga_3gram = "giga_3gram",
  cgiga_4gram = "giga_4gram",
  cgiga_5gram = "giga_5gram"
)
d <- center_columns(d, all_centered)

# Baseline models ----

marydur <- lmer(
  duration ~ cmary_dur + (1 + cmary_dur | speakerid),
  data = d, REML = FALSE
)
marycontext <- lmer(
  duration ~ cmary_context + (1 + cmary_context | speakerid),
  data = d, REML = FALSE
)
# Uses the uncentered `mary_dur`, as in the original script.
maryboth <- lmer(
  duration ~ cmary_context + mary_dur +
    (1 + cmary_context + mary_dur | speakerid),
  data = d, REML = FALSE
)
anova(marydur, maryboth)
anova(marycontext, maryboth)

baselineA <- marycontext
baselineC <- lmer(
  duration ~ cmary_context + cAmiFreq +
    (1 + cmary_context + cAmiFreq | speakerid),
  data = d, REML = FALSE
)
anova(baselineA, baselineC)
baselineA
baselineC

# NOTE(review): `control = list(maxIter = 1000)` is the control syntax of old
# lme4 releases; newer releases expect an lmerControl() object and may warn
# or fail here -- confirm against the installed lme4 version.
baselineE <- lmer(
  duration ~ cmary_context + cAmiFreq + cgigafreq +
    (1 + cmary_context + cAmiFreq | speakerid),
  data = d, control = list(maxIter = 1000),
  REML = FALSE
)
anova(baselineC, baselineE)

baselineE3 <- lmer(
  duration ~ cmary_context * cAmiFreq + cgigafreq +
    (1 + cmary_context + cAmiFreq | speakerid),
  data = d, REML = FALSE
)
anova(baselineE3, baselineE)
baselineE
baselineE3

# Full model and size of the srprsl effect ----

# Fitted with lmer's default REML setting, unlike the baselines above.
fullmodel <- lmer(
  duration ~ cmary_context * cAmiFreq + cgigafreq + cami_4gram +
    cgiga_4gram + srprsl +
    (1 + cmary_context + cAmiFreq + srprsl | speakerid),
  data = d
)

# Look coefficients up by name rather than by position, so that editing the
# formula cannot silently select a different term.
intercept <- fixef(fullmodel)[["(Intercept)"]]
betasrprsl <- fixef(fullmodel)[["srprsl"]]

# Predicted change in duration across the observed range of srprsl.
eff <- betasrprsl * max(d$srprsl) - betasrprsl * min(d$srprsl)
# Same computation between two hard-coded srprsl values.
betasrprsl * 16.277 - betasrprsl * 2.179

# Per-word minimum, maximum and range of srprsl.
levmin <- tapply(d$srprsl, d$word, min)
levmax <- tapply(d$srprsl, d$word, max)
levdiff <- levmax - levmin
# Take the labels from the tapply() result: levels(d$word) is NULL when
# `word` was read as character (the read.table default since R 4.0), and for
# a factor these names are exactly its levels.
lev <- names(levmin)
lf <- data.frame(lev, levmin, levmax, levdiff)

min(subset(d, word == "thing")$srprsl)
max(subset(d, word == "thing")$srprsl)

# Residualized n-gram predictors ----

d$residcgiga_4gram <- residuals(
  lm(cgiga_4gram ~ cmary_context * cAmiFreq + cgigafreq, data = d)
)
d$residcami_4gram <- residuals(
  lm(cami_4gram ~ cmary_context * cAmiFreq + cgigafreq + cgiga_4gram,
     data = d)
)
baselineF <- lmer(
  duration ~ cmary_context * cAmiFreq + cgigafreq + residcami_4gram +
    residcgiga_4gram + (1 + cmary_context + cAmiFreq | speakerid),
  data = d, REML = FALSE
)
baselineF

# Models on baseline residuals ----

baseline <- baselineE3
d$residuals <- residuals(baseline)
baseline2 <- baselineF
d$residuals2 <- residuals(baseline2)

ma <- lmer(residuals ~ srprsl + (1 + srprsl | speakerid), data = d)
ma
mc <- lmer(
  residuals ~ synSp * lexSp + (1 + synSp + lexSp | speakerid),
  data = d
)
mc
md <- lmer(residuals ~ ami_3gram + (1 + ami_3gram | speakerid), data = d)
md
me <- lmer(residuals ~ ptb_3gram + (1 + ptb_3gram | speakerid), data = d)
me
md4 <- lmer(residuals ~ ami_4gram + (1 + ami_4gram | speakerid), data = d)
md4
me4 <- lmer(residuals ~ ptb_4gram + (1 + ptb_4gram | speakerid), data = d)
me4
ma2 <- lmer(residuals2 ~ srprsl + (1 + srprsl | speakerid), data = d)
ma2
mc2 <- lmer(
  residuals2 ~ synSp * lexSp + (1 + synSp + lexSp | speakerid),
  data = d
)
mc2

# Model comparisons on duration ----

mm <- lmer(
  duration ~ cmary_context + cAmiFreq + cgigafreq + (1 | speakerid),
  data = d
)
mma <- lmer(
  duration ~ cmary_context + cAmiFreq + cgigafreq + csrprsl +
    (1 + csrprsl | speakerid),
  data = d
)
anova(mm, mma)

mmb <- lmer(
  duration ~ cmary_context + cAmiFreq + cgigafreq + cami_3gram +
    (1 + cami_3gram | speakerid),
  data = d
)
anova(mm, mmb)

mmd <- lmer(
  duration ~ cmary_context + cAmiFreq + cgigafreq + cptb_3gram +
    (1 + cptb_3gram | speakerid),
  data = d
)
anova(mm, mmd)

mme <- lmer(
  duration ~ cmary_context + cAmiFreq + cgigafreq + csynSp +
    (1 + csynSp | speakerid),
  data = d
)
anova(mm, mme)

mmf1 <- lmer(
  duration ~ cmary_context + cAmiFreq + cgigafreq + clexSp +
    (1 + clexSp | speakerid),
  data = d
)
mmf2 <- lmer(
  duration ~ cmary_context + cAmiFreq + cgigafreq + csynSp +
    (1 + csynSp | speakerid),
  data = d
)
mmf <- lmer(
  duration ~ cmary_context + cAmiFreq + cgigafreq + clexSp + csynSp +
    (1 + clexSp + csynSp | speakerid),
  data = d
)
anova(mm, mmf, mmf1, mmf2)

mmg <- lmer(
  duration ~ cmary_context + cAmiFreq + cgigafreq + clexSp + cptb_3gram +
    (1 + clexSp | speakerid),
  data = d
)
anova(mm, mmf1, mmd, mmg)

mmh <- lmer(
  duration ~ cmary_context + cAmiFreq + cgigafreq + csrprsl +
    (1 + csrprsl | speakerid),
  data = d
)

# 3-gram versus 4-gram predictors.
mmb4 <- lmer(
  duration ~ cmary_context + cAmiFreq + cgigafreq + cami_4gram +
    (1 + cami_4gram | speakerid),
  data = d
)
mmd4 <- lmer(
  duration ~ cmary_context + cAmiFreq + cgigafreq + cptb_4gram +
    (1 + cptb_4gram | speakerid),
  data = d
)
mmb34 <- lmer(
  duration ~ cmary_context + cAmiFreq + cgigafreq + cami_4gram +
    cami_3gram + (1 + cami_4gram | speakerid),
  data = d
)
mmd34 <- lmer(
  duration ~ cmary_context + cAmiFreq + cgigafreq + cptb_4gram +
    cptb_3gram + (1 + cptb_4gram | speakerid),
  data = d
)
anova(mmb4, mmb34)
anova(mmd4, mmd34)

mmb34b <- lmer(
  duration ~ cmary_context + cAmiFreq + cgigafreq + cami_4gram +
    cami_3gram + (1 + cami_3gram | speakerid),
  data = d
)
mmd34b <- lmer(
  duration ~ cmary_context + cAmiFreq + cgigafreq + cptb_4gram +
    cptb_3gram + (1 + cptb_3gram | speakerid),
  data = d
)
anova(mmb, mmb34b)
anova(mmd, mmd34b)

# giga n-gram predictors. The original script fitted mmgiga4 and mmgiga3
# twice with identical calls; each is fitted once here.
mmgiga4 <- lmer(
  duration ~ cmary_context * cAmiFreq + cgigafreq + cgiga_4gram +
    (1 + cgiga_4gram | speakerid),
  data = d
)
mmgiga3 <- lmer(
  duration ~ cmary_context * cAmiFreq + cgigafreq + cgiga_3gram +
    (1 + cgiga_3gram | speakerid),
  data = d
)
mmgiga5 <- lmer(
  duration ~ cmary_context * cAmiFreq + cgigafreq + cgiga_5gram +
    (1 + cgiga_5gram | speakerid),
  data = d
)
mmgiga54 <- lmer(
  duration ~ cmary_context * cAmiFreq + cgigafreq + cgiga_4gram +
    cgiga_5gram + (1 + cgiga_5gram | speakerid),
  data = d
)
anova(mmgiga5, mmgiga54)

# Checking validity of surprisal over ptb frequencies or trigrams ----

ptb <- subset(d, !is.na(PTBfreq))
# Fit on `ptb` itself so the residual vector is guaranteed to line up with
# the rows it is assigned to (the original fitted on `d` and depended on
# lmer dropping the NA rows).
ptb$residresid <- residuals(
  lmer(residuals ~ PTBfreq + cptb_4gram + (1 | speakerid), data = ptb)
)
lmer(residresid ~ srprsl + (1 + srprsl | speakerid), data = ptb)

# Native speakers ----

# Predictors that are re-centered within each speaker subgroup.
subgroup_centered <- c(
  cmary_dur = "mary_dur",
  cmary_context = "mary_context",
  cAmiFreq = "AMIfreq",
  cgigafreq = "gigafreq",
  csrprsl = "srprsl",
  cami_3gram = "ami_3gram",
  cgiga_4gram = "giga_4gram",
  csynSp = "synSp"
)

dnative <- subset(d, native_language == "english")
dnative <- center_columns(dnative, subgroup_centered)

engBaseline <- lmer(
  duration ~ cmary_context * cAmiFreq + cgigafreq +
    (1 + cmary_context + cAmiFreq | speakerid),
  data = dnative
)
engBaseline2 <- lmer(
  duration ~ cmary_context * cAmiFreq + cgigafreq + cami_3gram +
    cgiga_4gram + (1 + cmary_context + cAmiFreq | speakerid),
  data = dnative
)
engBaseline
engBaseline2

dnative$residEng <- residuals(engBaseline)
dnative$residEng2 <- residuals(engBaseline2)
lmer(residEng2 ~ srprsl + (1 + srprsl | speakerid), data = dnative)
lmer(residEng ~ srprsl + (1 + srprsl | speakerid), data = dnative)
lmer(residEng ~ ami_3gram + (1 + ami_3gram | speakerid), data = dnative)
lmer(residEng ~ ptb_3gram + (1 + ptb_3gram | speakerid), data = dnative)

# Non-native speakers ----

dnon <- subset(d, native_language != "english")
dnon <- center_columns(dnon, subgroup_centered)

nonnatBaseline <- lmer(
  duration ~ cmary_context * cAmiFreq + cgigafreq +
    (1 + cmary_context + cAmiFreq | speakerid),
  data = dnon
)
# NOTE(review): unlike engBaseline2, this model has no cami_3gram term --
# confirm that the asymmetry is intended.
nonnatBaseline2 <- lmer(
  duration ~ cmary_context * cAmiFreq + cgigafreq + cgiga_4gram +
    (1 + cmary_context + cAmiFreq | speakerid),
  data = dnon
)
nonnatBaseline
nonnatBaseline2

# The column names reuse the "Eng" suffix from the native-speaker section.
dnon$residEng <- residuals(nonnatBaseline)
dnon$residEng2 <- residuals(nonnatBaseline2)
lmer(residEng2 ~ srprsl + (1 + srprsl | speakerid), data = dnon)

# Correlations between the main variables ----

cor_cols <- c(
  "duration", "mary_dur", "mary_context", "gigafreq", "PTBfreq",
  "AMIfreq", "ami_3gram", "srprsl", "giga_4gram"
)
# NOTE(review): cor() uses its default NA handling here, so any NA in these
# columns (PTBfreq is NA-checked earlier in the script) yields NA
# correlations -- confirm whether `use = "pairwise.complete.obs"` is wanted.
cor(d[, cor_cols], d[, cor_cols])

sink()