-1
当我运行我的主题模型(topicmodels)代码时,出现了一个非常奇怪的错误。基本情况是:我有一个包含用户评论的 .csv 文件,想创建一个文档-词项矩阵(DTM),其中每条评论作为一个文档。我抽取了 8K 条评论作为样本,并使用了以下代码。(标题:R:topicmodels,两个相似的文件,代码对其中一个有效,对另一个无效)
> # Load libraries. tm provides the corpus and document-term-matrix functions used in this script.
> # NOTE(review): no stringr function is called in this excerpt; tic() starts a timer but no toc() is visible here.
> library(tm)
> library(SnowballC)
> library(stringr)
> library(tictoc)
> tic()
>
> # Path of the CSV sample to analyse.
> file_loc <- "C:/Users/Andreas/Desktop/first8k.csv"
>
> # Read the CSV with header = FALSE, so the first line of the file is treated as data.
> Database <- read.csv(file_loc, header = FALSE)
> require(tm)  # redundant: tm is already attached by library(tm) earlier in this script
> # NOTE(review): newer tm releases expect DataframeSource input with `doc_id` and `text` columns - confirm against the installed tm version.
> # Turn the data frame into a tm corpus.
> Database <- Corpus(DataframeSource(Database))
> # Lower-case the text; tolower is wrapped in content_transformer() for use with tm_map.
> Database <-tm_map(Database,content_transformer(tolower))
>
> # Cleaning pass: punctuation, digits, English stop words, then extra whitespace.
> Database <- tm_map(Database, removePunctuation)
> Database <- tm_map(Database, removeNumbers)
> Database <- tm_map(Database, removeWords, stopwords("english"))
> Database <- tm_map(Database, stripWhitespace)
>
> # Remove an additional, user-defined list of stop words.
> myStopwords <- c("some", "individual", "stop","words")
> Database <- tm_map(Database, removeWords, myStopwords)
> # Stem the remaining words (SnowballC is presumably loaded for this step).
> Database <- tm_map(Database,stemDocument)
>
> # Build the document-term matrix.
> # NOTE(review): current tm documentation lists `bounds` and `wordLengths` as control options, not minDocFreq / minWordLength - confirm these two have any effect.
> dtm <- DocumentTermMatrix(Database, control=list(minDocFreq=2,minWordLength=2))
> # Sum each row of the DTM and keep only the rows whose total is above zero.
> row_total = apply(dtm, 1, sum)
> dtm.new = dtm[row_total>0,]
> # The result of this call is only printed, never assigned: dtm and dtm.new (already built from dtm) stay as they were.
> removeSparseTerms(dtm, .99)
>
>>Outcome:DocumentTermMatrix (documents: 12753, terms: 194)
>Non-/sparse entries: 66261/2407821
>Sparsity : 97%
>Maximal term length: 11
>Weighting : term frequency (tf)
>
> # Topic modelling: fit an LDA model with Gibbs sampling via the topicmodels package.
>
> library(topicmodels)
>
> # Values passed to LDA() through its `control` list.
> # See the topicmodels Gibbs-control documentation for the exact meaning of burnin / iter / thin / best.
> burnin <- 100
> iter <- 500
> thin <- 100
> seed <-list(200,5,500,3700,1666)  # five seeds, matching nstart = 5
> nstart <- 5
> best <- TRUE
>
> # Passed as the second argument (k) of LDA().
> k <- 12
>
> # The model is fitted on dtm.new, not on the removeSparseTerms() result, which was only printed.
> ldaOut <-LDA(dtm.new,k, method="Gibbs", control=list(nstart=nstart, seed = seed, best=best, burnin = burnin, iter = iter, thin=thin))
>
所以这一个运行得很好。但如果我换用另一个同样包含 8K 条评论的样本(同样是 csv 文件,格式等都相同),就会出现下面的错误:
> library(tm)  # provides the corpus and document-term-matrix functions used in this script
> library(SnowballC)
> library(stringr)  # NOTE(review): no stringr function is called in this excerpt
> library(tictoc)
> tic()  # starts a timer; no toc() is visible in this excerpt
>
> # Path of the second CSV sample; this is the only statement whose value differs from the first run.
> file_loc <- "C:/Users/Andreas/Desktop/try8k.csv"
>
> # Read the CSV with header = FALSE, so the first line of the file is treated as data.
> Database <- read.csv(file_loc, header = FALSE)
> require(tm)  # redundant: tm is already attached by library(tm) earlier in this script
> # NOTE(review): newer tm releases expect DataframeSource input with `doc_id` and `text` columns - confirm against the installed tm version.
> # Turn the data frame into a tm corpus.
> Database <- Corpus(DataframeSource(Database))
> # Lower-case the text; tolower is wrapped in content_transformer() for use with tm_map.
> Database <-tm_map(Database,content_transformer(tolower))
>
> # Cleaning pass: punctuation, digits, English stop words, then extra whitespace.
> Database <- tm_map(Database, removePunctuation)
> Database <- tm_map(Database, removeNumbers)
> Database <- tm_map(Database, removeWords, stopwords("english"))
> Database <- tm_map(Database, stripWhitespace)
>
> # Remove an additional, user-defined list of stop words.
> myStopwords <- c("some", "individual", "stop","words")
> Database <- tm_map(Database, removeWords, myStopwords)
> # Stem the remaining words (SnowballC is presumably loaded for this step).
> Database <- tm_map(Database,stemDocument)
> # NOTE(review): current tm documentation lists `bounds` and `wordLengths` as control options, not minDocFreq / minWordLength - confirm these two have any effect.
> dtm <- DocumentTermMatrix(Database,control=list(minDocFreq=2,minWordLength=2))
> # Sum each row of the DTM and keep only the rows whose total is above zero.
> row_total = apply(dtm, 1, sum)
> dtm.new = dtm[row_total>0,]
> # Only printed, never assigned: the "terms: 0" summary that follows describes this discarded result, not dtm or dtm.new.
> removeSparseTerms(dtm, .99)
>
>>Outcome:DocumentTermMatrix (documents: 9875, terms: 0)
Non-/sparse entries: 0/0
Sparsity : 100%
Maximal term length: 0
Weighting : term frequency (tf)
>
> # Topic modelling: fit an LDA model with Gibbs sampling via the topicmodels package.
>
> library(topicmodels)
>
> # Values passed to LDA() through its `control` list.
> # See the topicmodels Gibbs-control documentation for the exact meaning of burnin / iter / thin / best.
> burnin <- 100
> iter <- 500
> thin <- 100
> seed <-list(200,5,500,3700,1666)  # five seeds, matching nstart = 5
> nstart <- 5
> best <- TRUE
>
> # Passed as the second argument (k) of LDA().
> k <- 12
>
> # The model is fitted on dtm.new, not on the removeSparseTerms() result; inspect dtm.new itself (e.g. dim(dtm.new)) when debugging the error that follows.
> ldaOut <-LDA(dtm.new,k, method="Gibbs", control=list(nstart=nstart, seed = seed, best=best, burnin = burnin, iter = iter, thin=thin))
>Fehler in obj[[i]][[which.max(sapply(obj[[i]], logLik))]] :
>attempt to select less than one element in get1index
我猜是 DTM 本身没有正常工作,因为它显示有 9875 个文档,却完全没有词项(terms)。但我完全不知道为什么这段代码适用于一个样本,却不适用于另一个样本。如果我在代码上做错了什么,或者您发现了其他错误,请告诉我。
提前致谢!
感谢您的回答。但正如我所说,我的两个数据集是相似的,所以第二个当然也包含词项。我不明白的是,为什么 R 会把这些词项过滤掉,或者根本没有识别到它们。是预处理的问题…… – Andres