我正在尝试创建语料库,但我希望在每个文档中组合两个连续的单词,而不是得到只包含单个单词的语料库。(标题:在 R 中通过组合词语创建语料库)
我正在使用下面的脚本。有没有办法创建语料库,使每个"文档"中的词项都是由两个连续单词合并而成的词组(二元组)?请指教。
library(plyr)
library(tm)
library(e1071)
## Load the labelled training data and keep only the two columns used
## downstream: the target label ("Group") and the free text ("Description").
## NOTE(review): setwd() in a script is fragile — prefer an absolute path in
## read.csv() or the `here` package; kept so the relative "Data.csv" still
## resolves as in the original workflow.
setwd("C:/Assignment/Assignment-Group-Prediction/IPM")
training <- read.csv("Data.csv", header = TRUE, na.strings = c(""))
Res_Desc_Train <- subset(training, select = c("Group", "Description"))

## Step 1: build a corpus with one document per Description entry.
docs <- Corpus(VectorSource(Res_Desc_Train$Description))
## Step 2: normalise the text.
## One regex pass replaces EVERY non-alphanumeric character with a space,
## which covers all of the symbols the original handled one at a time
## (/, -, :, ;, @, parentheses, commas, underscores) AND fixes a subtle
## bug: the original deleted unlisted punctuation outright, gluing the
## surrounding words together ("foo.bar" -> "foobar"). Replacing with a
## space keeps the word boundary; stripWhitespace collapses the runs.
removeSpecialChars <- content_transformer(function(x) gsub("[^a-zA-Z0-9 ]", " ", x))

docs <- tm_map(docs, content_transformer(tolower))  # lower-case once (original did it twice)
docs <- tm_map(docs, removeSpecialChars)            # also makes removePunctuation redundant
docs <- tm_map(docs, removeWords, stopwords("en"))
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)

## Step 3: what the question actually asks for — a document-term matrix
## whose terms are pairs of consecutive words (bigrams). The tokenizer
## splits a document on whitespace and pastes each word to its successor,
## e.g. "a b c" -> c("a b", "b c"); a document with fewer than two words
## contributes no terms.
BigramTokenizer <- function(x) {
  words <- unlist(strsplit(as.character(x), "\\s+"))
  words <- words[nzchar(words)]          # drop empty tokens from leading spaces
  if (length(words) < 2) return(character(0))
  paste(head(words, -1), tail(words, -1))
}
dtm <- DocumentTermMatrix(docs, control = list(tokenize = BigramTokenizer))
您可能还想试试 tidytext 包,它比 tm 包更容易使用(恕我直言),并且内置了二元组(bigram)功能。 – lawyeR