This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
replacemask snc: #replacemask# 14061.5
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
plot(cars)
Add a new katze hund dreimal schwarz by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
load libraries and set SBC ressources adress
#20240103(07.42)
###############################################################
### this script runs without local files, all data fetched from online sources.
###############################################################################
R.1<-"https://www.linguistics.ucsb.edu/research/santa-barbara-corpus"
Q.2<-"https://www.linguistics.ucsb.edu/sites/secure.lsit.ucsb.edu.ling.d7/files/sitefiles/research/SBC/SBCorpus.zip"
library(utils)
library(stringi)
library(readr)
library(quanteda.textstats)
library(quanteda)
library(udpipe) # for pos tagging
library(writexl)
#knitr::write_bib(c(.packages()), "packages.bib")
local<-"~/boxHKW/21S/DH/local/SPUND/corpuslx"
udpipepath<-sprintf("%s/english-ewt-ud-2.5-191206.udpipe",local)
### if not yet, the model must be downloaded, comment in above line
get.udp<-function(){
udpipe_download_model("english",model_dir = tempdir("md"))
mdf<-list.files(tempdir())
mdw<-grep(".udpipe",mdf)
mdfile<-paste0(tempdir(),"/",mdf[mdw])
md<-udpipe_load_model(mdfile)
}
### if the model is not on disk, it is downloaded
ifelse(exists("udpipepath"),md<-udpipe_load_model(udpipepath),md<-get.udp())
getwd()
### tempfile to store zip
sbctemp<-tempfile("SBCtemp.zip")
sbctempdir<-tempdir()
download.file(Q.2,sbctemp)
unzip(sbctemp,exdir = sbctempdir)
sbctrn<-paste0(sbctempdir,"/TRN/")
filestrn<-list.files(sbctrn)
trndf<-data.frame(scb=NA,id=NA,text=NA)
#k<-1
for(k in 1:length(filestrn)){
#cat(k,"\n")
trntemp<-read_delim(paste0(sbctrn,filestrn[k]),
delim = "\t", escape_double = FALSE,
trim_ws = TRUE,col_names = c("id","spk","text"))
l1<-length(trntemp)
trntext<-trntemp[,l1]
colnames(trntext)<-"text"
trntemp.2<-data.frame(scb=k,id=1:length(trntext$text),text=trntext)
trndf<-rbind(trndf,trntemp.2)
}
trn.doc.1<-readLines(paste0(sbctrn,filestrn[1]))
print(trn.doc.1[1:15])
create subsets for /make/ /take/ /give/
trndf.2<-trndf[2:length(trndf$scb),]
trndf.2$lfd<-1:length(trndf.2$scb)
rownames(trndf.2)<-trndf.2$lfd
#trndf$lfd<-1:length(trndf$scb)
trndf<-trndf.2
the annotation of concrete/light use of lemma /make/ was done in a
subset table of the corpus, assigning either 0 for concrete and 1 for
light use of the verb in context. concrete use would include forms with
objects of the lemma that (for /make/) that (in the sense of the
alternate constructions) can be also
produced, built, generated, created
. these are obvious
semantic alternatives of /make/ defined in (Mehl 2021).
######################################
### instances concrete vs. light
### Q.1: (Mehl 2021)
i.make.w<-c(concrete=68,light=321) #17% vs. 83% written ICE
i.make.s<-c(concrete=96,light=353) #spoken ICE
i.take<-c(con=62,light=85)
i.give<-c(con=52,light=167)
###########################
#trndf_sf<-trndf # saved created, load only annotations dataframe
# dtemp<-tempfile()
# download.file("https://github.com/esteeschwarz/SPUND-LX/raw/main/corpusLX/14015-HA/data/SBC.ann.df.RData",dtemp)
# load(dtemp)
# get.ann.x<-function(scb,ann.df){
# trndf.lm<-cbind(scb,ann.df)
# }
# trndf.lm<-get.ann.x(trndf.2,scb.ann.df)
tokenize and lemmatize/pos-tag df
################
### new with preloaded/built df
#load("~/boxHKW/21S/DH/local/SPUND/corpuslx/stefanowitsch/HA/data/trndf.lm.cpt.RData")
scb.unique<-unique(trndf$scb)
k<-1
#scb.unique<-unique(trndf$scb)
scb<-1
# scb.ann.list<-list()
write.ann.df<-function(df){
scb.ann.list<-list()
scb.token.list<-list()
trndf.lm<-df
for(scb in 1:length(scb.unique)){
cat(scb,"\n")
###14043.
sbc<-trndf.lm
scb.id<-scb.unique[scb]
scb.sub<-subset(sbc,sbc$scb==scb.id)
colnames(scb.sub)[1]<-"doc_id"
scb.sub$text<-gsub("[+%?~,-.0-9()=<>@]|\\]|\\[","",scb.sub$text)
scb.sub$text<-gsub("(^ )","",scb.sub$text)
colnames(scb.sub)[3]<-"text_field"
sbc.sub.c<-corpus(scb.sub,docid_field = 'doc_id',text_field = 'text_field',unique_docnames = F)
an4<-udpipe_annotate(md,x=sbc.sub.c,tokenizer="tokenizer",tagger = "default",trace = 2)
###wks.
an7<-data.frame(an4)
an7$doc_id<-gsub("doc","",an7$doc_id)
colnames(an7)[1]<-"line"
an8<-an7[,c(1,4,5,6,7,8,9,10,11,12)]
an8$sbc.id<-scb.id
an8<-an8[,c(11,1,2,3,4,5,6,7,8,9,10)]
an8$alt<-"a-other"
an8$light<-NA
line.u<-unique(an8$line)
ns.list<-paste0("sbc",scb.id)
scb.ann.list[[ns.list]]<-an8
}
return(scb.ann.list)
}
### call above function which tokenizes, lemmatizes and postags the corpus & writes xlsx-tables per interview / returns dataframe (list type) of annotated corpus
sbc.token.list<-write.ann.df(trndf.lm)
ann.x<-write.ann.df(trndf.lm)
scb.pos.df.list<-ann.x
scb.ann.list<-scb.pos.df.list
#save(scb.pos.df.list,file = "~/boxHKW/21S/DH/local/SPUND/corpuslx/stefanowitsch/HA/data/scb.ann.list.pos-all-dfs.RData")
#wks.
#save(scb.ann.list,file="~/boxHKW/21S/DH/local/SPUND/corpuslx/stefanowitsch/HA/data/scb.ann.list.RData")
#t<-grep("token",colnames(scb.ann.list$sbc1))
t<-colnames(scb.ann.list$sbc1)=="token"
t<-which(t)
scbns<-colnames(scb.ann.list$sbc1)
### this is necessary for pepper to recognize the token column in the df
scbns[t]<-"tok"
scbns<-gsub("\\.","_",scbns)
rename.list<-function(x){
x2<-data.frame(x)
colnames(x2)=scbns
return(x2)
}
scb.ann.list.nr<-lapply(scb.ann.list, rename.list)
head(scb.ann.list.nr$sbc1)
scb.ann.list<-scb.ann.list.nr
### same as above writing xlsx, here from annotated list
# for(k in 1:length(scb.ann.list)){
# colnames(scb.ann.list[[k]])<-scbns
# xldir<-"~/boxHKW/21S/DH/local/SPUND/corpuslx/annis/xls"
# ns.df<-paste0(xldir,"/SCB-pos_",k,".xlsx")
# write_xlsx(scb.ann.list[[k]],ns.df)
#
# }
#x<-scb.ann.list$sbc1
export.ann<-function(x){
ns<-colnames(x)
wns<-ns=="sentence"|ns=="tok"
wns
x1<-x[,!wns]
return(x1)
}
sbc.only.pos.annotation<-lapply(scb.ann.list, export.ann)
#save(sbc.only.pos.annotation,file = "~/Documents/GitHub/SPUND-LX/corpusLX/14015-HA/data/sbc.only.pos.annotation.RData")
the evaluation statistics can be done already with the df, but for different purposes e.g. the better visualisation of the corpus and extensive queries an ANNIS (Druskat, Gast, and Krause (2016)) has been created.
call to external scripts which run pepper on the provided xlsx and create an ANNIS graph file for import to the ANNIS installation.
pepper.call("~/boxHKW/21S/DH/local/SPUND/corpuslx/annis/r-conxl5.pepper","SBC_v1.0.1","SBC_v1.0.1")
pepper.call("~/boxHKW/21S/DH/local/SPUND/corpuslx/annis/r-conxl6.pepper","SBC_v1.0.1","SBC_v1.0.1")
zipannis("SBC_annis","SBC_annis.zip")
configuration xml: r-conxl5.pepper
<?xml version='1.0' encoding='UTF-8'?>
<pepper-job id="tt001" version="1.0">
<importer name="SpreadsheetImporter" path="./xls"/>
<exporter name="TreetaggerExporter" path="./SBC_v1.0.1/"/>
</pepper-job>
configuration xml: r-conxl6.pepper
<?xml version='1.0' encoding='UTF-8'?>
<pepper-job id="tt001" version="1.0">
<importer name="TreetaggerImporter" path="./SBC_v1.0.1/">
</importer>
<exporter name="ANNISExporter" path="./SBC_annis/">
</exporter>
</pepper-job>
# dtemp<-tempfile()
# download.file("https://github.com/esteeschwarz/SPUND-LX/raw/main/corpusLX/14015-HA/data/sbc.only.pos.annotation.RData",dtemp)
# load(dtemp)
get.corpus.deprel<-function(x){
ns<-colnames(x)
x2<-cbind()
# assign unique id to token
x$sbc.token.id<-as.double(paste0(x$sbc_id,"_", 1:length(x$sbc.id)))
x$pos.0<-"lfd.pos"
x$obj<-NA
tdf<-t(x)
rtdf<-rownames(tdf)
rtdf.0<-which(rtdf=="pos.0")
all.zero<-which(tdf=="lfd.pos")-rtdf.0
#########################################
pos.deprel<-all.zero+grep("dep",rtdf)
pos.head<-all.zero+grep("head",rtdf)
pos.upos<-all.zero+grep("upos",rtdf)
pos.lemma<-all.zero+grep("lemma",rtdf)
pos.token<-all.zero+which(rtdf=="token")
pos.token.id<-all.zero+which(rtdf=="token_id")
#########################
t.head<-pos.head
t.id<-pos.token.id
t.tag<-pos.upos
t.token<-pos.token
t.lemma<-pos.lemma
t.deprel<-pos.deprel
m.h.t.0<-which(tdf[t.head]==0)
h.t.pos.rel<-as.double(tdf[t.id])-as.double(tdf[t.head])
h.t.pos.rel[m.h.t.0]<-0
h.t.pos.abs<-t.id-(h.t.pos.rel*length(tdf[,1]))
h.t.value<-t.token-(h.t.pos.rel*length(tdf[,1]))
m1<-which(h.t.value<0)
sum(m1)
h.t.value[m1]<-1 # prevent negative subscripts
h.l.value<-t.lemma-(h.t.pos.rel*length(tdf[,1]))
m2<-which(h.l.value<0)
sum(m2)
h.l.value[m2]<-1 # prevent negative subscripts
head(tdf[h.l.value])
####################
all.obj<-tdf=="obj"
all.ob.w<-which(all.obj)
all.ob.w
obj.head<-as.double(all.ob.w-1)
tdf[obj.head]
obj.tag<-all.ob.w-4 #4
tdf[obj.tag]
obj.lemma<-all.ob.w-5 #5
tdf[obj.lemma]
obj.token<-all.ob.w-6 #6
tdf[obj.token]
obj.id<-as.double(all.ob.w-7) #7
tdf[obj.id]
h.t.o.pos.rel<-as.double(tdf[obj.id])-as.double(tdf[obj.head])
h.t.o.pos.abs<-obj.id-(h.t.o.pos.rel*length(tdf[,1]))
tdf[h.t.o.pos.abs]
h.t.o.value<-obj.token-(h.t.o.pos.rel*length(tdf[,1]))
tdf[h.t.o.value]
h.l.o.value<-obj.lemma-(h.t.o.pos.rel*length(tdf[,1]))
tdf[h.l.o.value]
obj.pos<-all.ob.w+5
tdf[obj.pos]<-tdf[h.l.o.value]
####################
tdf.r<-as.data.frame(t(tdf))
m<-!is.na(tdf.r$obj)
x$obj<-tdf.r$obj
x$head_token_value<-tdf[h.t.value]
x$head_lemma_value<-tdf[h.l.value]
mode(x$line)<-"double"
mode(x$sbc.id)<-"double"
mode(x$token_id)<-"double"
mode(x$head_token_id)<-"double"
return(x)
}
corpus.head.list<-lapply(scb.pos.df.list, get.corpus.deprel)
### get corpus object
corpus.df.deprel_f<-data.frame(corpus.head.list$sbc1)
for (k in 2:length(corpus.head.list)){
corpus.df.deprel_f<-rbind(corpus.df.deprel_f,corpus.head.list[[k]])
}
on base of concrete object arrays for make/take/give
get.light.annotation<-function(corpus.df.deprel){
concrete.give<-c(1066,2620,10469,20369,20373,20377,31957,41100,45424,45538,48045,50236,51759,52340,52341,54654,56016,60668,
61952,64351,67497,69012,70356,71167,74595,75162,76991,77442,77553,81098,81099,81859,81860,94278,
96953,99281,99880)
concrete.give.txt<-c("sticker","sweets","antibiotic","gift","iguana","recognition","toothpick","herb","anything","enzyme","cake","lettuce","candy","card","literature","ornament","tape","ticket","pair","clothes","juice","pepper","money","goldfish","machine","cup","kiss","amount","bit","picture","mine","pass","dollar","ten","drink","something","car","lot")
concrete.make.txt<-c("horseshoe","sound","cartilage","ceviche","food","noise","hay","grape","cookie","spatula","clothes","wiper","quilt","outfit","copy","tape","string","intercession","application","balloon","basket","kebab","salad","juice","gravy","tamale","sauce","ton","tail","stuff","papers","pasta","loaf","sandwich","ornament","picture","pillow","database","statue","pizza","fudge","recipe","pan","plate","decaf","tart")
concrete.take<-c(848,6381,14466,16674,18611,18809,19366,22031,24813,24827,24829,24831,24832,24834,24835,29159,32908,36540,
38239,38243,38247,38253,38254,38258,45020,45021,45032,49577,49582,49583,49588,53267,56405,56406,56409,
59372,61588,61592,65654,65656,65657,66021,69440,71127,72201,72320,73797,73798,78435,78440,78442,
79454,79456,82282,83099,83834,83836,84599,85311,85932,88155,89310,91865,93070,96464,96465,99149,
99745,104020,117695)
concrete.take.txt<-c("balloon","shelf","checkbook","car","bag","everything","puppies","silverware","torque","tree","Tupperware","wastebasket","wire","money","capsule","guitar","stub","tail","Tylenol","blanket","clipping","tablecloth","crown","medicine","nail","spacesuit","sweater","hers","knife","rack","rock","diary","woodwork","pill","ticket","trash","plug","some","tape","band","flip","water","container","pants","buck","insulin","foot","painting","drug","gift","cart","hair","egg","ball","dollar","pound","drink","thing","NPH")
concrete.false.take<-c("while","time","care","advantage","picture","half","off","down","dollars to do it","look","with me","out","them to")
concrete.false.take.regx<-paste0(concrete.false.take,collapse = "|")
concrete.false.take.regx<-paste0("(",concrete.false.take.regx,")")
#concrete.take.txt<-gsub("\\.[NA0-1]","",concrete.take.txt)
# write_clip(paste0(concrete.take.txt,collapse = '","'))
alt.array<-c(make=c("build","create","produce","generate"),take=c("carry","bring"),give="give")
##########################################################
### apply light label
corpus.df.deprel$light<-NA
corpus.df.deprel$alt<-"a-other"
###
# q.lemma<-"make|made|making"
# q.lemma<-"give|given|gave"
# lemma<-"make"
# lemma<-"give"
# ###
# # concrete.array<-c(concrete.make.txt)
# # concrete.array<-c(concrete.give.txt)
# # # concrete.array
apply.light<-function(corpus.df.deprel=corpus.df.deprel,q.lemma,lemma,concrete.array){
corpus.df.deprel$lemma<-gsub("[^a-zA-z']","",corpus.df.deprel$lemma)
corpus.df.deprel$head_lemma_value<-gsub("[^a-zA-z']","",corpus.df.deprel$head_lemma_value)
m5<-corpus.df.deprel$lemma==""
corpus.df.deprel$lemma[m5]<-NA
m6<-corpus.df.deprel$head_lemma_value==""
corpus.df.deprel$head_lemma_value[m6]<-NA
m1<-grepl(q.lemma,corpus.df.deprel$sentence)
m13<-grepl(lemma,corpus.df.deprel$lemma)
m14<-grepl(lemma,corpus.df.deprel$head_lemma_value)
# sum(m1)
# sum(m13)
# #corpus.df.deprel$sentence[m1]
# #unique(corpus.df.deprel$head_lemma_value)
# length(unique(corpus.df.deprel$head_token_value))
# length(unique(corpus.df.deprel$token))
# #table(corpus.df.deprel$lemma)
corpus.df.deprel$alt[m1]<-lemma # set concrete instances
corpus.df.deprel$light[m13]<-1 # set all to light
#lemma
library(stringi)
library(purrr)
concrete.regx<-paste0(concrete.array,collapse = "|")
concrete.regx<-paste0('(',concrete.regx,')')
m38<-grepl(concrete.regx,corpus.df.deprel$lemma)
m41.alt<-corpus.df.deprel$alt==lemma
#sum(m41)
corpus.df.deprel$light[m41.alt]<-1
m42.conc<-corpus.df.deprel$lemma[m41.alt]%in%concrete.array|corpus.df.deprel$token[m41.alt]%in%concrete.array
# sum(m42.conc)
m43.obj<-corpus.df.deprel$obj[m41.alt][m42.conc]==lemma
corpus.df.deprel$light[m41.alt][m42.conc][m43.obj]<-0
# sum(m40)
# corpus.df.deprel$lemma[m41][m39][m40]
m39.conc.sent<-grepl(concrete.regx,corpus.df.deprel$sentence[m41.alt])
m40.lemma.alt.sent<-grepl(lemma,corpus.df.deprel$lemma[m41.alt][m39.conc.sent])
# sum(lemma,corpus.df.deprel$lemma[m39][m40])
# corpus.df.deprel$light[m38]<-0
corpus.df.deprel$light[m41.alt][m39.conc.sent][m40.lemma.alt.sent]<-0
return(corpus.df.deprel)
}
corpus.df.deprel<-apply.light(corpus.df.deprel,"make|made|making","make",concrete.make.txt)
corpus.df.deprel<-apply.light(corpus.df.deprel,"take|took|taken|taking","take",concrete.take.txt)
corpus.df.deprel<-apply.light(corpus.df.deprel,"give|gave|given|giving","give",concrete.give.txt)
corpus.df.deprel<-apply.light(corpus.df.deprel,"(produce[^r]|produced|producing)","produce",concrete.make.txt)
corpus.df.deprel<-apply.light(corpus.df.deprel,"create|created|creating","create",concrete.make.txt)
corpus.df.deprel<-apply.light(corpus.df.deprel,"generate|generated|generating","generate",concrete.make.txt)
corpus.df.deprel<-apply.light(corpus.df.deprel,"build|built|building","build",concrete.make.txt)
corpus.df.deprel<-apply.light(corpus.df.deprel,"carry|carried|carrying","carry",concrete.take.txt)
corpus.df.deprel<-apply.light(corpus.df.deprel,"bring|brought|bringing","bring",concrete.take.txt)
table(corpus.df.deprel$alt,corpus.df.deprel$light,corpus.df.deprel$head_lemma_value)
#chk
return(corpus.df.deprel)
}
corpus.df.deprel.new<-get.light.annotation(corpus.df.deprel)
functions to create a subset and feed into collexeme analysis
get.collex<-function(coll6,filter.pos,vers,na.rm=FALSE){
m3<-coll6$lemma==coll6$head_lemma_value # remove observations with lemma==head_lemma
sum(m3,na.rm = T)
#coll6na<-coll6
coll6<-coll6[!m3,]
m4<-!is.na(coll6$light)
# sum(m4)
# k<-1
if(na.rm==F)
coll6$light[m4]<-"n.a."
# m5<-is.na(coll6$obj.to)
# coll6<-coll6[!m5,]
if(length(filter.pos)>0){
for(k in length(filter.pos)){
col<-names(filter.pos[k])
coll6<-coll6[coll6[[col]]%in%filter.pos[[k]],]
}
}
if(vers=="light"){
colldf.light<-data.frame(head_lemma=coll6$head_lemma_value,lemma=coll6$lemma,light=coll6$light)
coll6.2<-collex.covar.mult(colldf.light,threshold = 1,decimals = 3)
}
# coll6.2
if(vers=="lemma"){
colldf.lemma<-data.frame(head_lemma=coll6$head_lemma_value,lemma=coll6$lemma)
coll6.2<-collex.covar(colldf.lemma,decimals = 3)
}
return(coll6.2)
}
#discard<-"queen"
get.collex.obj<-function(coll6,display.light=NULL,select.filter=NULL,display.filter=NULL,discard=NULL){
coll6.obj<-data.frame(lemma=unlist(coll6$lemma),obj=unlist(coll6$obj),upos=unlist(coll6$upos),light=coll6$light)
coll6.obj.n<-coll6.obj[coll6.obj$upos=="NOUN"&!is.na(coll6.obj$obj),]
colldf<-data.frame(obj=coll6.obj.n$obj,lemma=coll6.obj.n$lemma,light=coll6.obj.n$light)
if(length(select.filter)>0){
colldf<-colldf[colldf$obj%in%select.filter,]
}
if(length(discard)>0){
colldf<-colldf[colldf$lemma!=discard,]
}
if(length(display.light)==0){
coll6.2<-collex.covar(data.frame(colldf[,1],colldf[,2]),decimals = 3)
}
if(length(display.light)>0){
coll6.2<-collex.covar.mult(colldf,threshold = 1,decimals = 3)
coll6.2<-coll6.2[coll6.2$light%in%display.light,]
}
if(length(display.filter)>0){
coll6.2<-coll6.2[coll6.2[,1]%in%display.filter,]
}
return(coll6.2)
}
coll6<-corpus.df.deprel.new
#############
### instances concrete vs. light
### Q.1: (Mehl 2021)
i.make.w<-c(concrete=68,light=321) #17% vs. 83% written ICE
i.make.s<-c(concrete=96,light=353) #spoken ICE
i.take.w<-c(con=62,light=85)
i.give.w<-c(con=52,light=167)
i.take.s<-c(con=131,light=79)
i.give.s<-c(con=105,light=227)
"in the written portion of ICE-GB, the light use of each verb is more common than the concrete sense.
For example, out of the total number of instances of make in all concrete and light uses,
just over 80% of instances are the light use, and just under 20% are the concrete use."
rownames(plotdf.ann$plot.dist)<-c("concrete","light")
lsbc<-length(corpus.df.deprel$sbc.id)
plotdf.ann<-list(lsbc=lsbc,plot.dist=plotdf1,ann=list(main="distribution of lemmas over corpora",ylab="absolute occurences",
legend.text = c("concrete use","light use")))
rownames(plotdf.ann$plot.dist)<-c("concrete","light")
define subsets and create collex df
coll6.2<-get.collex.obj(coll6,display.light=c(0,1),display.filter = "make")
coll6.2<-get.collex.obj(coll6,display.light=NULL,display.filter = c("make","take","give"))
coll6.2.light<-get.collex.obj(coll6,display.light=c(0))
#table(coll6.2.light$obj)
coll6.2.obj<-get.collex.obj(coll6,display.light=NULL)
obj.array<-c(make.array,take.array,"give")
display.filter<- obj.array
coll6.2.obj.f<-get.collex.obj(coll6,display.light=c(0,1),display.filter = obj.array)
apply.model<-function(coll6,p.lower.tail,select.filter=NULL){
#boxplot(amodel$COLL.STR.LOGL~amodel$SLOT1)
amodel<-get.collex.obj(coll6,select.filter = select.filter)
df<-length(levels(factor(amodel$SLOT1)))-1
df
amodel$p<-pt(amodel$COLL.STR.LOGL,df,lower.tail = p.lower.tail)
amodel<-amodel[amodel$SLOT1%in%make.array,]
amodel<-rbind(amodel[duplicated(amodel$SLOT2,fromLast = T),],amodel[duplicated(amodel$SLOT2,fromLast = F),])
#amodel<-amodel.d[amodel.d$SLOT1%in%make.array,]
ifelse(length(select.filter)>0,model<-"model2",model<-"model1")
xlab<-sprintf("lemma in equivalent context; lower.tail=%s, %s",p.lower.tail,model)
boxplot(amodel$COLL.STR.LOGL~amodel$SLOT1,outline=F,main="preference of make vs. alternates",xlab = xlab,ylab = "T-score of lemma/object association strength")
boxplot(amodel$p~amodel$SLOT1,outline=F,main="preference of make vs. alternates",xlab = xlab,ylab = "p-value of lemma/object association strength")
### preference of make over produce
amodel<-get.collex.obj(coll6)
df<-length(levels(factor(amodel$SLOT1)))-1
df
amodel$p<-pt(amodel$COLL.STR.LOGL,df,lower.tail = p.lower.tail)
amodel<-amodel[amodel$SLOT1%in%take.array,]
amodel<-rbind(amodel[duplicated(amodel$SLOT2,fromLast = T),],amodel[duplicated(amodel$SLOT2,fromLast = F),])
#amodel<-amodel.d[amodel.d$SLOT1%in%make.array,]
boxplot(amodel$COLL.STR.LOGL~amodel$SLOT1,outline=F,main="preference of take vs. alternates",xlab = xlab,ylab = "T-score of lemma/object association strength")
boxplot(amodel$p~amodel$SLOT1,outline=F,main="preference of take vs. alternates",
xlab = xlab,ylab = "p-value of lemma/object association strength")
}
obj.make<-coll6.2.obj[coll6.2.obj$SLOT1%in%make.array,]
sema.make<-obj.make[obj.make$SLOT2%in%obj.make[duplicated(obj.make$SLOT2),][,2],]
obj.take<-coll6.2.obj[coll6.2.obj$SLOT1%in%take.array,]
sema.take<-obj.take[obj.take$SLOT2%in%obj.take[duplicated(obj.take$SLOT2),][,2],]
coll.sema.1<-list(make=sema.make,take=sema.take)
x<-sema.make
y<-make.array
sema.sum<-function(y)y=sum(x['COLL.STR.LOGL'][x['SLOT1']==y])
make.sum<-list(make.array)
make.sum[make.array]<-lapply(y, sema.sum)
make.sum
x<-sema.take
y<-take.array
sema.sum<-function(y)y=sum(x['COLL.STR.LOGL'][x['SLOT1']==y])
take.sum<-list(take.array)
take.sum[take.array]<-lapply(y, sema.sum)
df<-factor(coll6.2.obj$SLOT1)
df<-length(levels(df))
get.p<-function(x)pt(x,df,lower.tail = F)
#get.p<-function(x)pt(x,df,lower.tail = T)
eval1<-sema.make
eval1$p<-unlist(lapply(eval1$COLL.STR.LOGL, get.p))
eval2<-sema.take
eval2$p<-unlist(lapply(eval2$COLL.STR.LOGL, get.p))
par(las=3)
eval.make<-eval1
eval.take<-eval2
### concrete objects frequency
make.array<-c("make","generate","produce","create","build")
take.array<-c("carry","bring")
obj.array<-c(make.array,take.array,"give")
display.filter<- obj.array
coll6.2.obj.f<-get.collex.obj(coll6,display.light=NULL,display.filter = obj.array)
coll6.2.obj.f
obj.t<-table(coll6.2.obj.f[,1])
obj.all.t<-obj.t[obj.t!=0]
### concrete objects:
concrete.array<-c("give",concrete.make.txt,concrete.take.txt)
sub.obj.t<-coll6.2.obj.f[coll6.2.obj.f$SLOT2%in%concrete.array,]
conc.obj.t<-table(sub.obj.t[,1])
obj.all.conc.t<-conc.obj.t[conc.obj.t!=0]
obj.eval<-rbind(all.objects=obj.all.t,all.conc.obj=obj.all.conc.t)
obj.eval