我是noob R程序员。我写了一个代码,需要将一个函数应用于按因素分割的数据框。数据框本身包含大约一百万个324961个观察值,其中有64376个因子用于我们用于切分数据帧的变量。在R中并行使用
的代码如下:
library("readstata13")
# Reading the Stata Data file into R
bod_fb <- read.dta13("BoD_nonmissing_fb.dta")
gen_fuzzy_blau <- function(bod_sample){
# Here we drop the Variables that are not required in creating the Fuzzy-Blau index
bod_sample <- as.data.frame(bod_sample)
bod_sample$tot_occur <- as.numeric(bod_sample$tot_occur)
bod_sample$caste1_occ <- as.numeric(bod_sample$caste1_occ)
bod_sample$caste2_occ <- as.numeric(bod_sample$caste2_occ)
bod_sample$caste3_occ <- as.numeric(bod_sample$caste3_occ)
bod_sample$caste4_occ <- as.numeric(bod_sample$caste4_occ)
# Calculating the Probabilites of a director belonging to a caste
bod_sample$caste1_occ <- (bod_sample$caste1_occ)/(bod_sample$tot_occur)
bod_sample$caste2_occ <- (bod_sample$caste2_occ)/(bod_sample$tot_occur)
bod_sample$caste3_occ <- (bod_sample$caste3_occ)/(bod_sample$tot_occur)
bod_sample$caste4_occ <- (bod_sample$caste4_occ)/(bod_sample$tot_occur)
#Dropping the Total Occurances column, as we do not need it anymore
bod_sample$tot_occur<- NULL
# Here we replace all the blanks with NA
bod_sample <- apply(bod_sample, 2, function(x) gsub("^$|^ $", NA, x))
bod_sample <- as.data.frame(bod_sample)
# Here we push all the NAs in the caste names and caste probabilities to the end of the row
# So if there are only two castes against a name, then they become caste1 and caste2
caste_list<-data.frame(bod_sample$caste1,bod_sample$caste2,bod_sample$caste3,bod_sample$caste4)
caste_list = as.data.frame(t(apply(caste_list,1, function(x) { return(c(x[!is.na(x)],x[is.na(x)]))})))
caste_list_prob<-data.frame(bod_sample$caste1_occ,bod_sample$caste2_occ,bod_sample$caste3_occ,bod_sample$caste4_occ)
caste_list_prob = as.data.frame(t(apply(caste_list_prob,1, function(x) { return(c(x[!is.na(x)],x[is.na(x)]))})))
# Here we write two functions: 1. gen_castelist
# 2. gen_caste_prob
# gen_castelist: This function takes the row number (serial number of the direcor)
# and returns the names of all the castes for which he has a non-zero
# probability.
# gen_caste_prob: This function takes the row number (serial number of the director)
# and returns the probability with which he belongs to the caste
#
gen_castelist <- function(x){
y <- caste_list[x,]
y <- as.vector(y[!is.na(y)])
return(y)
}
gen_caste_prob <- function(x){
z <- caste_list_prob[x,]
z <- z[!is.na(z)]
z <- as.numeric(z)
return(z)
}
caste_ls <-list()
caste_prob_ls <- list()
for(i in 1:nrow(bod_sample))
{
caste_ls[[i]]<- gen_castelist(i)
caste_prob_ls[[i]]<- gen_caste_prob(i)
}
gridcaste <- expand.grid(caste_ls)
gridcaste <- data.frame(lapply(gridcaste, as.character), stringsAsFactors=FALSE)
gridcasteprob <- expand.grid(caste_prob_ls)
# Generating the Joint Probability
gridcasteprob$JP <- apply(gridcasteprob,1,prod)
# Generating the Similarity Index
gen_sim_index <- function(x){
x <- t(x)
a <- as.data.frame(table(x))
sim_index <- sum(a$Freq^2)/(sum(a$Freq))^2
return(sim_index)
}
gridcaste$sim_index <- apply(gridcaste,1,gen_sim_index)
# Generating fuzzyblau
gridcaste$fb <- gridcaste$sim_index * gridcasteprob$JP
fuzzy_blau_index <- sum(gridcaste$fb)
remove_list <- c("gridcaste","")
return(fuzzy_blau_index)
}
fuzzy_blau_output <- by(bod_fb,bod_fb$code_year,gen_fuzzy_blau)
# Saving the output as a dataframe with two columns
# Column 1 is the fuzzy blau index
# Column 2 is the code_year
code_year <- names(fuzzy_blau_output)
fuzzy_blau <- as.data.frame(as.vector(unlist(fuzzy_blau_output)))
names(fuzzy_blau) <- c("fuzzy_blau_index")
fuzzy_blau$code_year <- code_year
bod_fb <- merge(bod_fb,fuzzy_blau,by = "code_year")
save.dta13(bod_fb,"bod_fb_example.dta")
如果代码TL;博士,摘要如下:
我有一个数据帧bod_fb
。我需要通过以bod_fb$code_year
的因子对数据帧进行限幅来在此数据帧上应用gen_fuzzy_blau
函数。
由于函数非常大,顺序处理花费的时间不止一天,最终会耗尽内存。函数gen_fuzzy_blau
为数据帧的每个code_year返回一个数值变量fuzzy_blau_index
。我使用by
来应用每个片上的功能。我想知道是否有一种方法可以并行实现此代码,以便函数的多个实例在数据帧的不同片段上同时运行。我没有找到parallel
包的by
实现,我不知道如何在使用foreach
和doParallel
包时将数据帧作为迭代器传递。
我有一个4GB内存的AMD A8笔记本电脑和Windows 7 SP1家庭基本。我给了20GB作为页面文件内存(这是我得到内存错误后)。
谢谢
编辑1: @milkmotel我已经消除代码冗余并删除了for循环,但在gen_sim_index
被浪费在函数的时间,数额巨大,我现在用的是proc.time()
功能来衡量代码的每个部分所采取的时间。
该函数被假定为如下行: 如果我们有一行(不是向量)说:a a b c
相似性指数将是(2/4)^ 2 +(1/4)^ 2 + (1/4)^ 2即(每行的每个独特元素的出现次数/该行中的元素总数)的总和^ 2
我无法直接在行上使用apply
函数,因为连续的每个元素,因为行中的每个元素具有不同的因子,并且table()
不会正确输出频率。
什么是编码gen_sim_index
函数的有效方法?