我正在从大型数据集中制作一些图表.在这段代码中,生成的所需绘图对象的大小非常小,但内存使用量的增加远不止于此.
到目前为止,我的发现是,内存使用量的增加似乎是由某些对象造成的.特别是,在图形绘制过程之后,对象tab_ind的值没有改变(使用identical()函数检查),但它的大小在处理之后显著增加(使用object.size()函数检查).在这个过程中,我对tab_ind所做的唯一操作就是将它作为参数传递给函数.
可重复的例子
可以通过改变N来控制模拟的规模.运行结束时,会打印tab_ind的大小变化,并检查其前后是否相同.
# First reproducible example: simulates data, bins it, builds six ggplot
# objects and prints memory.size() around almost every step so that the point
# at which memory grows can be located.
library(data.table)
library(magrittr)
library(ggplot2)
# Number of simulated datasets; controls the size of the simulation.
N <- 6000
# The seed is itself drawn from the RNG, so the generated data differ between
# runs unless the RNG state was fixed beforehand.
set.seed(runif(1, 0, .Machine$integer.max) %>% ceiling)
# Log-odds transform and its inverse.
logit <- function(x) {return(log(x/(1-x)))}
invLogit <- function(x) {return(exp(x)/(1+exp(x)))}
# One row per dataset with two 0/1 flags; MIX_ALL is forced to 0 wherever
# MIX_MIN_SUCCESS is 0.
tab_dat <- data.table(datasetID = seq(N), MIX_MIN_SUCCESS = sample(c(0, 1), N, replace = T), MIX_ALL = sample(c(0, 1), N, replace = T))
tab_dat[MIX_MIN_SUCCESS == 0, MIX_ALL := 0]
# Number of individual rows per dataset (between 20 and 300).
n <- sample(20:300, N, replace = T)
# One row per individual: two uniform probabilities plus their logits.
tab_ind <- data.table(
  datasetID = rep(seq(N), times = n),
  SIM_ADJ_PP1 = runif(sum(n), 0.00001, 0.99999),
  MIX_ADJ_PP1 = runif(sum(n), 0.00001, 0.99999)
)
tab_ind[, c("SIM_ADJ_LOGIT_PP1", "MIX_ADJ_LOGIT_PP1") := list(logit(SIM_ADJ_PP1), logit(MIX_ADJ_PP1))]
# Prints a status label, then memory.size() before and after a gc() call.
# NOTE(review): memory.size() is documented as Windows-specific -- confirm it
# is available on the platform where this is run.
checkMem_gc <- function(status) {
  print(status)
  print(memory.size())
  gc()
  print(memory.size())
}
## Individual bins for x and y
# Bins columns `x` and `y` of `dt` (given as column-name strings) into xNItv
# and yNItv intervals and counts the rows falling in every (x bin, y bin)
# combination.
#   by = "even"     : equally spaced breaks between the column min and max
#   by = "quantile" : breaks at equally spaced quantiles
# Returns a data.table with the bin codes, the bin midpoints and the count N,
# with columns renamed to "<x>_binCode", "<x>_binMid", "<y>_binCode",
# "<y>_binMid". Stops on any other value of `by`.
tab_by_bin_idxy <- function(dt, x, y, xNItv, yNItv, by = "quantile") {
  #Binning
  if (by == "even") {
    checkMem_gc("start x-y breaks")
    checkMem_gc("start x breaks")
    minN = dt[, min(get(x), na.rm = T)]
    checkMem_gc("after x min")
    maxN = dt[, max(get(x), na.rm = T)]
    checkMem_gc("after x max")
    xBreaks = seq(minN, maxN, length.out = xNItv + 1)
    checkMem_gc("after seq")
    checkMem_gc("after x breaks")
    yBreaks = dt[, seq(min(get(y), na.rm = T), max(get(y), na.rm = T), length.out = yNItv + 1)]
    checkMem_gc("after y breaks")
  } else if (by == "quantile") {
    xBreaks = dt[, quantile(get(x), seq(0, 1, length.out = xNItv + 1), names = F)]
    yBreaks = dt[, quantile(get(y), seq(0, 1, length.out = yNItv + 1), names = F)]
  } else {stop("type of 'by' not support")}
  checkMem_gc("after x-y breaks")
  # Bin code per row, then the midpoint of that row's bin (mean of the two
  # enclosing breaks), for x and then for y.
  xbinCode = dt[, .bincode(get(x), breaks = xBreaks, include.lowest = T)]
  checkMem_gc("after x binCode")
  xbinMid = sapply(seq(xNItv), function(i) {return(mean(xBreaks[c(i, i+1)]))})[xbinCode]
  checkMem_gc("after x binMid")
  ybinCode = dt[, .bincode(get(y), breaks = yBreaks, include.lowest = T)]
  checkMem_gc("after y binCode")
  ybinMid = sapply(seq(yNItv), function(i) {return(mean(yBreaks[c(i, i+1)]))})[ybinCode]
  checkMem_gc("after y binMid")
  #Creating table
  # All (x bin, y bin) combinations, joined against the per-row bin table so
  # that every combination gets one summary row.
  tab_match = CJ(xbinCode = seq(xNItv), ybinCode = seq(yNItv))
  checkMem_gc("after tab match")
  tab_plot = data.table(xbinCode, xbinMid, ybinCode, ybinMid)[
    tab_match,
    .(xbinMid = xbinMid[1], ybinMid = ybinMid[1], N = .N),
    keyby = .EACHI,
    on = c("xbinCode", "ybinCode")
  ]
  checkMem_gc("after tab plot")
  colnames(tab_plot)[colnames(tab_plot) == "xbinCode"] = paste0(x, "_binCode")
  colnames(tab_plot)[colnames(tab_plot) == "xbinMid"] = paste0(x, "_binMid")
  colnames(tab_plot)[colnames(tab_plot) == "ybinCode"] = paste0(y, "_binCode")
  colnames(tab_plot)[colnames(tab_plot) == "ybinMid"] = paste0(y, "_binMid")
  checkMem_gc("after col name")
  # Explicitly drop the intermediates so the following memory reading
  # excludes them.
  rm(list = c("xBreaks", "yBreaks", "xbinCode", "ybinCode", "xbinMid", "ybinMid", "tab_match"))
  checkMem_gc("after rm")
  #Returning table
  return(tab_plot)
}
# Summarises column `y` of `dt` inside Nbin overlapping windows over column
# `x` (both given as column-name strings).
#   by = "even"     : windows of width `width` on the scale of x
#   by = "quantile" : windows spanning a `width` fraction of the quantile range
# Returns a data.table with one row per window: binCode, window limits and
# midpoint, mean, sd, 2.5% and 97.5% quantiles of y, and mean +/- 1 sd.
# Stops on any other value of `by`.
tab_by_obin_x_str_y <- function(dt, x, y, width, Nbin, by = "even") {
  #Binning
  if (by == "even") {
    xLLim = dt[, seq(min(get(x), na.rm = T), max(get(x), na.rm = T) - width, length.out = Nbin)]
    xULim = dt[, seq(min(get(x), na.rm = T) + width, max(get(x), na.rm = T), length.out = Nbin)]
  } else if (by == "quantile") {
    xLLim = dt[, quantile(get(x), seq(0, 1 - width, length.out = Nbin), names = F)]
    xULim = dt[, quantile(get(x), seq(width, 1, length.out = Nbin), names = F)]
  } else {stop("type of 'by' not support")}
  xbinMid = (xLLim + xULim) / 2
  #summarizing y
  # sapply() yields one column per window, hence the transpose before the
  # conversion to a data.table.
  tab_out <- sapply(seq(Nbin), function(i) {
    dt[get(x) >= xLLim[i] & get(x) <= xULim[i], c(mean(get(y), na.rm = T), sd(get(y), na.rm = T), quantile(get(y), c(0.025, 0.975), names = F))]
  }) %>%
    t %>%
    as.data.table %>%
    set_colnames(., c("mean", "sd", ".025p", ".975p")) %>%
    cbind(data.table(binCode = seq(Nbin), xLLim, xbinMid, xULim), .)
  tab_out[, c("mean_plus_1sd", "mean_minus_1sd") := list(mean + sd, mean - sd)]
  return(tab_out)
}
# plotEnv receives the finished plots; backupEnv caches the summary tables.
plotEnv <- new.env()
backupEnv <- new.env()
gc()
gc()
checkMem_gc("Starting memory size checking")
# Baseline: session memory and the size of every object visible from here.
start.mem.size <- memory.size()
start_ObjSizes <- sapply(ls(), function(x) {object.size(get(x))})
# NOTE: plain `<-` does not copy a data.table; start_tab_ind is a second name
# for the same object as tab_ind, so it follows any by-reference change made
# to tab_ind later (data.table::copy() would be needed for a snapshot).
start_tab_ind <- tab_ind
start_tab_ind_size <- object.size(tab_ind)
# The whole plotting process is evaluated inside dummyEnv via with(), so that
# removing dummyEnv afterwards drops everything created in the block.
dummyEnv <- new.env()
with(dummyEnv, {
  ## Set function for analyses against SIM_PP1
  # Builds the probability-scale and logit-scale plots for `dt` and returns
  # them as list(plot_prob, plot_logit).
  # With newTab = F the summary tables are taken from backupEnv when present.
  # NOTE(review): the cache names do not depend on `dt`, so later calls with
  # newTab = F reuse tables computed from the first `dt` -- confirm intended.
  fcn_SIM_PP1 <- function(dt, newTab = T) {
    dat_prob = tab_by_bin_idxy(dt, x = "SIM_ADJ_PP1", y = "MIX_ADJ_PP1", xNItv = 50, yNItv = 50, by = "even")
    checkMem_gc("after tab prob")
    dat_logit = tab_by_bin_idxy(dt, x = "SIM_ADJ_LOGIT_PP1", y = "MIX_ADJ_LOGIT_PP1", xNItv = 50, yNItv = 50, by = "even")
    checkMem_gc("after tab logit")
    if ((!newTab) && exists("summarytab_logit_SIM_ADJ_PP1", where = backupEnv) && exists("summarytab_prob_SIM_ADJ_PP1", where = backupEnv)) {
      summarytab_logit = get("summarytab_logit_SIM_ADJ_PP1", envir = backupEnv)
      summarytab_prob = get("summarytab_prob_SIM_ADJ_PP1", envir = backupEnv)
    } else {
      summarytab_logit = tab_by_obin_x_str_y(dt, x = "SIM_ADJ_LOGIT_PP1", y = "MIX_ADJ_LOGIT_PP1", width = 0.05, Nbin = 1000, by = "even")
      # Probability-scale table: every logit-scale column except binCode and
      # sd is passed through invLogit(); column names are then restored.
      summarytab_prob = summarytab_logit[, .(
        binCode, invLogit(xLLim), invLogit(xbinMid), invLogit(xULim), invLogit(mean), sd,
        invLogit(`.025p`), invLogit(`.975p`), invLogit(mean_plus_1sd), invLogit(mean_minus_1sd)
      )] %>% set_colnames(colnames(summarytab_logit))
      assign("summarytab_logit_SIM_ADJ_PP1", summarytab_logit, envir = backupEnv)
      assign("summarytab_prob_SIM_ADJ_PP1", summarytab_prob, envir = backupEnv)
    }
    checkMem_gc("after summary tab")
    # Probability scale: binned counts as points, summary mean and
    # mean +/- 1 sd as lines.
    plot_prob <- ggplot(dat_prob, aes(x = SIM_ADJ_PP1_binMid)) +
      geom_vline(xintercept = 1, linetype = "dotted") +
      geom_hline(yintercept = 1, linetype = "dotted") +
      geom_abline(slope = 1, intercept = 0, size = 1.5, linetype = "dashed", alpha = 0.5) +
      geom_point(aes(y = MIX_ADJ_PP1_binMid, size = N), alpha = 0.5, na.rm = T) +
      geom_line(data = summarytab_prob, aes(x = xbinMid, y = mean), size = 1.25, color = "black", na.rm = T) +
      geom_line(data = summarytab_prob, aes(x = xbinMid, y = mean_plus_1sd), size = 1.25, color = "blue", na.rm = T, linetype = "dashed") +
      geom_line(data = summarytab_prob, aes(x = xbinMid, y = mean_minus_1sd), size = 1.25, color = "blue", na.rm = T, linetype = "dashed") +
      scale_size_continuous(range = c(0.5, 5)) +
      scale_x_continuous(name = "Simulated PP", breaks = seq(0, 1, 0.25), labels = c("0%", "25%", "50%", "75%", "100%")) +
      scale_y_continuous(name = "Estimated PP", limits = c(0, 1), breaks = seq(0, 1, 0.25), labels = c("0%", "25%", "50%", "75%", "100%")) +
      theme_classic() +
      theme(axis.title = element_text(size = 18), axis.text = element_text(size = 16))
    checkMem_gc("after plot prob")
    rm(dat_prob)
    rm(summarytab_prob)
    checkMem_gc("after removing dat_prob and summary_prob")
    # Logit scale: same layers, with axis breaks placed at the logits of the
    # labelled percentages.
    plot_logit <- ggplot(dat_logit, aes(x = SIM_ADJ_LOGIT_PP1_binMid)) +
      geom_abline(slope = 1, intercept = 0, size = 1.5, linetype = "dashed", alpha = 0.5) +
      geom_point(aes(y = MIX_ADJ_LOGIT_PP1_binMid, size = N), alpha = 0.5, na.rm = T) +
      geom_line(data = summarytab_logit, aes(x = xbinMid, y = mean), size = 1.25, color = "black", na.rm = T) +
      geom_line(data = summarytab_logit, aes(x = xbinMid, y = mean_plus_1sd), size = 1.25, color = "blue", na.rm = T, linetype = "dashed") +
      geom_line(data = summarytab_logit, aes(x = xbinMid, y = mean_minus_1sd), size = 1.25, color = "blue", na.rm = T, linetype = "dashed") +
      scale_size_continuous(range = c(0.5, 5)) +
      scale_x_continuous(name = "Simulated LOGIT PP1", breaks = c(0.00001, 0.001, 0.05, 0.5, 0.95, 0.999, 0.99999) %>% logit, labels = c("0.001%", "0.1%", "5%", "50%", "95%", "99.9%", "99.999%")) +
      scale_y_continuous(name = "Estimated LOGIT PP1", limits = c(-12, 12), breaks = c(0.00001, 0.001, 0.05, 0.5, 0.95, 0.999, 0.99999) %>% logit, labels = c("0.001%", "0.1%", "5%", "50%", "95%", "99.9%", "99.999%")) +
      theme_classic() +
      theme(axis.title = element_text(size = 18), axis.text = element_text(size = 16))
    checkMem_gc("after plot logit")
    rm(summarytab_logit)
    rm(dat_logit)
    checkMem_gc("after removing dat_logit and summary_logit")
    return(list(plot_prob, plot_logit))
  }
  checkMem_gc("after defining function")
  ## Tabling
  # NOTE: `:=` adds the two columns to tab_ind itself (by reference), and
  # tab_stat becomes another name for that same table rather than a copy.
  # The flags are looked up by using datasetID as a row number of tab_dat.
  tab_stat <- tab_ind[, c("MIX_MIN_SUCCESS", "MIX_ALL") := list(
    tab_dat[tab_ind[, datasetID], MIX_MIN_SUCCESS],
    tab_dat[tab_ind[, datasetID], MIX_ALL]
  )]
  checkMem_gc("after new tab_stat")
  tab_stat_MIN_SUCCESS <- tab_stat[MIX_MIN_SUCCESS == 1]
  checkMem_gc("after new new tab_stat_MIN_SUCCESS")
  tab_stat_MIX_ALL <- tab_stat[MIX_ALL == 1]
  checkMem_gc("after new tab_stat_MIX_ALL")
  # Generating ggplot objects
  print("--- start lst full ---")
  lst_full <- fcn_SIM_PP1(tab_stat, newTab = F)
  checkMem_gc("after lst full")
  rm(tab_stat)
  checkMem_gc("after rm tab_stat")
  print("--- start lst MIN_SUCCESS ---")
  lst_MIN_SUCCESS <- fcn_SIM_PP1(tab_stat_MIN_SUCCESS, newTab = F)
  checkMem_gc("after lst MIN_SUCCESS")
  rm(tab_stat_MIN_SUCCESS)
  checkMem_gc("after rm tab_MIN_SUCCESS")
  print("--- start lst MIX_ALL ---")
  lst_MIX_ALL <- fcn_SIM_PP1(tab_stat_MIX_ALL, newTab = F)
  checkMem_gc("after lst MIX_ALL")
  rm(tab_stat_MIX_ALL)
  checkMem_gc("after rm tab_stat_MIX_ALL")
  ## Start plotting
  # Store each plot in plotEnv under a descriptive name, dropping every list
  # once both of its plots have been stored.
  print("--- Start plotting ---")
  assign("full_sp_MIX_ADJ_PP1_vs_SIM_ADJ_PP1", lst_full[[1]], envir = plotEnv)
  checkMem_gc("after assign1")
  assign("full_sp_MIX_ADJ_LOGIT_PP1_vs_SIM_ADJ_LOGIT_PP1", lst_full[[2]], envir = plotEnv)
  checkMem_gc("after assign2")
  rm(lst_full)
  checkMem_gc("after removing lst_full")
  assign("MIN_SUCCESS_sp_MIX_ADJ_PP1_vs_SIM_ADJ_PP1", lst_MIN_SUCCESS[[1]], envir = plotEnv)
  checkMem_gc("after assign3")
  assign("MIN_SUCCESS_sp_MIX_ADJ_LOGIT_PP1_vs_SIM_ADJ_LOGIT_PP1", lst_MIN_SUCCESS[[2]], envir = plotEnv)
  checkMem_gc("after assign4")
  rm(lst_MIN_SUCCESS)
  checkMem_gc("after removing lst_MIN_SUCCESS")
  assign("MIX_ALL_sp_MIX_ADJ_PP1_vs_SIM_ADJ_PP1", lst_MIX_ALL[[1]], envir = plotEnv)
  checkMem_gc("after assign5")
  assign("MIX_ALL_sp_MIX_ADJ_LOGIT_PP1_vs_SIM_ADJ_LOGIT_PP1", lst_MIX_ALL[[2]], envir = plotEnv)
  checkMem_gc("after assign6")
  rm(lst_MIX_ALL)
  checkMem_gc("after removing lst_MIX_ALL")
})
checkMem_gc("--- Finishing ---")
rm(dummyEnv)
gc()
checkMem_gc("After clean up")
# Final readings, then a before/after report.
final.mem.size <- memory.size()
end_ObjSizes <- sapply(ls(), function(x) {object.size(get(x))})
print("")
print("")
print("--- The sizes of all objects (under .GlobalEnv) BEFORE the graph plotting process ---")
print("--- (Before the process starts, all existing objects are stored under .GlobalEnv) ---")
print(start_ObjSizes)
print("")
print("--- The sizes of all objects (under .GlobalEnv) AFTER the graph plotting process ---")
print(end_ObjSizes)
print("--- I have not altered any existing objects under .GlobalEnv during the process, I only passed them to functions. And yet their sizes increase! ---")
print("--- Let's look at the object tab_ind, which shows the largest inflation in object size ---")
print("--- This is the size of tab_ind BEFORE the process: ---")
print(start_tab_ind_size)
print("--- This is the size of tab_ind AFTER the process: ---")
print(object.size(tab_ind))
print("--- But they are identical (checked using the function identical())! ---")
# NOTE: start_tab_ind and tab_ind name the same data.table (see the plain
# `<-` assignment above), so this comparison cannot detect the added columns.
print(identical(start_tab_ind, tab_ind))
print("")
更新的可重复示例
这是一个更新的,更简单的可重复示例.最新的发现是,要制作一个data.table对象的副本,应该使用<- data.table::copy()来代替<-.后者仅创建指向相同值的指针(即按引用).通过新指针修改值也会改变原始对象,这就是当我更改新指针时原始对象大小膨胀的原因.不过我不确定这是否是内存使用量膨胀的唯一来源.
# Reduced reproducible example: simulates data, bins it, builds three ggplot
# objects on a *copy* of tab_ind and records memory use and object sizes
# before and after, so the two snapshots can be compared.
library(data.table)
library(magrittr)
library(ggplot2)

# Number of simulated datasets; change this to scale the simulation.
N <- 6000

# Fixed seed: seeding from a value drawn from the RNG itself (as before) gives
# different data on every run, which defeats a reproducible example.
set.seed(20180423L)

# Log-odds transform and its inverse.
logit <- function(x) {
  log(x / (1 - x))
}
invLogit <- function(x) {
  exp(x) / (1 + exp(x))
}

# One row per dataset with two 0/1 flags; MIX_ALL is forced to 0 wherever
# MIX_MIN_SUCCESS is 0.
tab_dat <- data.table(
  datasetID = seq(N),
  MIX_MIN_SUCCESS = sample(c(0, 1), N, replace = TRUE),
  MIX_ALL = sample(c(0, 1), N, replace = TRUE)
)
tab_dat[MIX_MIN_SUCCESS == 0, MIX_ALL := 0]

# Number of individual rows per dataset (between 20 and 300).
n <- sample(20:300, N, replace = TRUE)

# One row per individual with two uniform probabilities.
tab_ind <- data.table(
  datasetID = rep(seq(N), times = n),
  SIM_ADJ_PP1 = runif(sum(n), 0.00001, 0.99999),
  MIX_ADJ_PP1 = runif(sum(n), 0.00001, 0.99999)
)

## Individual bins for x and y
# Bins columns `x` and `y` of `dt` (given as column-name strings) into xNItv
# and yNItv intervals and counts the rows in every (x bin, y bin) combination.
#   by = "even"     : equally spaced breaks between the column min and max
#   by = "quantile" : breaks at equally spaced quantiles
# Returns a data.table with the bin codes, the bin midpoints and the count N,
# with columns named "<x>_binCode", "<x>_binMid", "<y>_binCode", "<y>_binMid".
# Stops with an explicit error for any other value of `by` (previously this
# surfaced as "object 'xBreaks' not found").
tab_by_bin_idxy <- function(dt, x, y, xNItv, yNItv, by = "quantile") {
  # Binning
  if (by == "even") {
    minN <- dt[, min(get(x), na.rm = TRUE)]
    maxN <- dt[, max(get(x), na.rm = TRUE)]
    xBreaks <- seq(minN, maxN, length.out = xNItv + 1)
    yBreaks <- dt[, seq(min(get(y), na.rm = TRUE), max(get(y), na.rm = TRUE), length.out = yNItv + 1)]
  } else if (by == "quantile") {
    xBreaks <- dt[, quantile(get(x), seq(0, 1, length.out = xNItv + 1), names = FALSE)]
    yBreaks <- dt[, quantile(get(y), seq(0, 1, length.out = yNItv + 1), names = FALSE)]
  } else {
    stop("type of 'by' not support")
  }

  # Bin code per row, then the midpoint of that row's bin (mean of the two
  # enclosing breaks). vapply() pins the result to one double per bin.
  xbinCode <- dt[, .bincode(get(x), breaks = xBreaks, include.lowest = TRUE)]
  xbinMid <- vapply(
    seq_len(xNItv),
    function(i) mean(xBreaks[c(i, i + 1)]),
    numeric(1)
  )[xbinCode]
  ybinCode <- dt[, .bincode(get(y), breaks = yBreaks, include.lowest = TRUE)]
  ybinMid <- vapply(
    seq_len(yNItv),
    function(i) mean(yBreaks[c(i, i + 1)]),
    numeric(1)
  )[ybinCode]

  # Creating table: join all (x bin, y bin) combinations against the per-row
  # bin table so that every combination gets one summary row.
  tab_match <- CJ(xbinCode = seq_len(xNItv), ybinCode = seq_len(yNItv))
  tab_plot <- data.table(xbinCode, xbinMid, ybinCode, ybinMid)[
    tab_match,
    .(xbinMid = xbinMid[1], ybinMid = ybinMid[1], N = .N),
    keyby = .EACHI,
    on = c("xbinCode", "ybinCode")
  ]

  # Rename by reference; `colnames<-` on a data.table copies the table.
  setnames(
    tab_plot,
    old = c("xbinCode", "xbinMid", "ybinCode", "ybinMid"),
    new = c(
      paste0(x, "_binCode"), paste0(x, "_binMid"),
      paste0(y, "_binCode"), paste0(y, "_binMid")
    )
  )

  # Kept from the original: drop the intermediates before returning.
  rm(list = c("xBreaks", "yBreaks", "xbinCode", "ybinCode", "xbinMid", "ybinMid", "tab_match"))

  # Returning table
  tab_plot
}

# plotEnv receives the finished plots; backupEnv is created but not used in
# this reduced example.
plotEnv <- new.env()
backupEnv <- new.env()
gc()
gc(verbose = TRUE)

# Baseline: session memory and the size of every object visible from here.
# NOTE(review): memory.size() is documented as Windows-specific -- confirm it
# is available on the platform where this is run.
start.mem.size <- memory.size()
start_ObjSizes <- vapply(
  ls(),
  function(x) as.numeric(object.size(get(x))),
  numeric(1)
)
# copy() takes a real snapshot; plain `<-` would only bind a second name to
# the same data.table.
start_tab_ind <- copy(tab_ind)
start_tab_ind_size <- object.size(tab_ind)

# The plotting process is evaluated inside dummyEnv via with(), so removing
# dummyEnv afterwards drops everything created in the block.
dummyEnv <- new.env()
with(dummyEnv, {
  ## Set function for analyses against SIM_PP1
  # Builds the probability-scale plot for `dt` and returns the ggplot object.
  # `newTab` is accepted for call compatibility but is not used here.
  fcn_SIM_PP1 <- function(dt, newTab = TRUE) {
    dat_prob <- tab_by_bin_idxy(
      dt,
      x = "SIM_ADJ_PP1", y = "MIX_ADJ_PP1",
      xNItv = 50, yNItv = 50, by = "even"
    )
    plot_prob <- ggplot(dat_prob, aes(x = SIM_ADJ_PP1_binMid)) +
      geom_vline(xintercept = 1, linetype = "dotted") +
      geom_hline(yintercept = 1, linetype = "dotted") +
      geom_abline(slope = 1, intercept = 0, size = 1.5, linetype = "dashed", alpha = 0.5) +
      geom_point(aes(y = MIX_ADJ_PP1_binMid, size = N), alpha = 0.5, na.rm = TRUE) +
      scale_size_continuous(range = c(0.5, 5)) +
      scale_x_continuous(
        name = "Simulated PP",
        breaks = seq(0, 1, 0.25),
        labels = c("0%", "25%", "50%", "75%", "100%")
      ) +
      scale_y_continuous(
        name = "Estimated PP",
        limits = c(0, 1),
        breaks = seq(0, 1, 0.25),
        labels = c("0%", "25%", "50%", "75%", "100%")
      ) +
      theme_classic() +
      theme(axis.title = element_text(size = 18), axis.text = element_text(size = 16))
    plot_prob
  }

  ## Tabling
  # Work on a copy so that `:=` below cannot add columns to tab_ind.
  tab_stat <- copy(tab_ind)
  # `:=` modifies tab_stat in place, so no re-assignment is needed. The flags
  # are looked up by using datasetID as a row number of tab_dat.
  tab_stat[, c("MIX_MIN_SUCCESS", "MIX_ALL") := list(
    tab_dat[tab_stat[, datasetID], MIX_MIN_SUCCESS],
    tab_dat[tab_stat[, datasetID], MIX_ALL]
  )]
  tab_stat_MIN_SUCCESS <- tab_stat[MIX_MIN_SUCCESS == 1]
  tab_stat_MIX_ALL <- tab_stat[MIX_ALL == 1]

  # Generating ggplot objects
  lst_full <- fcn_SIM_PP1(tab_stat, newTab = FALSE)
  lst_MIN_SUCCESS <- fcn_SIM_PP1(tab_stat_MIN_SUCCESS, newTab = FALSE)
  lst_MIX_ALL <- fcn_SIM_PP1(tab_stat_MIX_ALL, newTab = FALSE)

  ## Start plotting
  # Store each plot in plotEnv under a descriptive name.
  assign("full_sp_MIX_ADJ_PP1_vs_SIM_ADJ_PP1", lst_full, envir = plotEnv)
  assign("MIN_SUCCESS_sp_MIX_ADJ_PP1_vs_SIM_ADJ_PP1", lst_MIN_SUCCESS, envir = plotEnv)
  assign("MIX_ALL_sp_MIX_ADJ_PP1_vs_SIM_ADJ_PP1", lst_MIX_ALL, envir = plotEnv)
})

# Clean up, then take the final readings for comparison with the baseline.
rm(dummyEnv)
rm(start_tab_ind)
gc(verbose = TRUE)
final.mem.size <- memory.size()
end_ObjSizes <- vapply(
  ls(),
  function(x) as.numeric(object.size(get(x))),
  numeric(1)
)
我sessionInfo()
在运行上面的例子时:
R version 3.5.0 (2018-04-23) Platform: x86_64-w64-mingw32/x64 (64-bit) Running under: Windows >= 8 x64 (build 9200) Matrix products: default locale: [1] LC_COLLATE=English_Hong Kong SAR.1252 LC_CTYPE=English_Hong Kong SAR.1252 LC_MONETARY=English_Hong Kong SAR.1252 [4] LC_NUMERIC=C LC_TIME=English_Hong Kong SAR.1252 attached base packages: [1] stats graphics grDevices utils datasets methods base other attached packages: [1] ggplot2_2.2.1 magrittr_1.5 data.table_1.11.4 loaded via a namespace (and not attached): [1] colorspace_1.3-2 scales_0.5.0 compiler_3.5.0 lazyeval_0.2.1 plyr_1.8.4 tools_3.5.0 pillar_1.2.3 gtable_0.2.0 [9] tibble_1.4.2 yaml_2.1.19 Rcpp_0.12.18 grid_3.5.0 rlang_0.2.1 munsell_0.4.3
Technophobe0.. 5
我的感觉是你需要增加--min-vsize=
.为什么?该错误cannot allocate vector of size ...
意味着您需要增加--min-vsize=
.
R --min-vsize=400M
在.Renviron
文件中创建或添加条目.
R_VSIZE=400M
参考:友好的R启动配置
https://cran.r-project.org/web/packages/startup/vignettes/startup-intro.html
你在运行64位操作系统吗?[是/否]
你在运行64位版本的R吗?[是/否]
如果您对这些问题中的任何一个回答"否",我建议您升级.
这里的现实是,如果您需要增加最小vsize,您可能需要检查代码中是否存在内存分配方面的陷阱.在大多数情况下,您会发现数据在赋值时被复制了.
有关R Gotcha的更多信息,我强烈建议您阅读:
Patrick Burns在线书籍R Inferno
R为固定大小和可变大小的对象分别维护独立的内存区域.第一类被分配为一个cons单元数组(Lisp程序员会知道它们是什么,其他人可以把它们看作语言本身的构建块,解析树等),第二类则被放在由每个8字节的'Vcells'组成的堆上.每个cons单元在32位构建的R上占用28个字节,在64位构建上(通常)占用56个字节.
默认值(当前)是初始设置350k个cons单元和6Mb的向量堆.请注意,这些区域最初并未实际分配:这些值只是触发垃圾回收的大小.这些值可以在R启动时通过命令行选项--min-nsize和--min-vsize来设置(如果未使用这些选项,则通过环境变量R_NSIZE和R_VSIZE).此后,R将根据使用情况增大或缩小这些区域,但不会降到初始值以下.最大向量堆大小可以使用环境变量R_MAX_VSIZE设置.
R在垃圾收集器中花费的时间将取决于这些初始设置以及内存管理器在内存填满时所做的权衡,即在收集垃圾以释放未使用的内存和增长这些区域之间.可以通过将环境变量R_GC_MEM_GROW设置为0到3之间的整数值来指定用于增长的策略.此变量在启动时读取.较高的值会更积极地增加堆,从而减少垃圾收集时间但使用更多内存.
参考:https://www.rdocumentation.org/packages/base/versions/3.5.1/topics/Memory
除非操作系统的默认设置已更改为允许更多(最高3Gb),否则32位Windows下的地址空间限制为2Gb.请参阅https://www.microsoft.com/whdc/system/platform/server/PAE/PAEmem.mspx和https://msdn.microsoft.com/en-us/library/bb613473(VS.85).aspx.在大多数64位版本的Windows下,32位版本的R的限制是4Gb:对于最老的版本,它是2Gb.64位版本的R(由OS强加)的限制是8Tb.
由于Windows在地址空间中间的预分配,即使在64位Windows上,通常也不可能将2Gb分配给32位版本的R中的单个向量.
在Windows下,R对单个会话可用的总内存分配施加限制,因为操作系统无法执行此操作:请参阅memory.size和memory.limit.
我的感觉是你需要增加--min-vsize=
.为什么?该错误cannot allocate vector of size ...
意味着您需要增加--min-vsize=
.
R --min-vsize=400M
在.Renviron
文件中创建或添加条目.
R_VSIZE=400M
参考:友好的R启动配置
https://cran.r-project.org/web/packages/startup/vignettes/startup-intro.html
你在运行64位操作系统吗?[是/否]
你在运行64位版本的R吗?[是/否]
如果您对这些问题中的任何一个回答"否",我建议您升级.
这里的现实是,如果您需要增加最小vsize,您可能需要检查代码中是否存在内存分配方面的陷阱.在大多数情况下,您会发现数据在赋值时被复制了.
有关R Gotcha的更多信息,我强烈建议您阅读:
Patrick Burns在线书籍R Inferno
R为固定大小和可变大小的对象分别维护独立的内存区域.第一类被分配为一个cons单元数组(Lisp程序员会知道它们是什么,其他人可以把它们看作语言本身的构建块,解析树等),第二类则被放在由每个8字节的'Vcells'组成的堆上.每个cons单元在32位构建的R上占用28个字节,在64位构建上(通常)占用56个字节.
默认值(当前)是初始设置350k个cons单元和6Mb的向量堆.请注意,这些区域最初并未实际分配:这些值只是触发垃圾回收的大小.这些值可以在R启动时通过命令行选项--min-nsize和--min-vsize来设置(如果未使用这些选项,则通过环境变量R_NSIZE和R_VSIZE).此后,R将根据使用情况增大或缩小这些区域,但不会降到初始值以下.最大向量堆大小可以使用环境变量R_MAX_VSIZE设置.
R在垃圾收集器中花费的时间将取决于这些初始设置以及内存管理器在内存填满时所做的权衡,即在收集垃圾以释放未使用的内存和增长这些区域之间.可以通过将环境变量R_GC_MEM_GROW设置为0到3之间的整数值来指定用于增长的策略.此变量在启动时读取.较高的值会更积极地增加堆,从而减少垃圾收集时间但使用更多内存.
参考:https://www.rdocumentation.org/packages/base/versions/3.5.1/topics/Memory
除非操作系统的默认设置已更改为允许更多(最高3Gb),否则32位Windows下的地址空间限制为2Gb.请参阅https://www.microsoft.com/whdc/system/platform/server/PAE/PAEmem.mspx和https://msdn.microsoft.com/en-us/library/bb613473(VS.85).aspx.在大多数64位版本的Windows下,32位版本的R的限制是4Gb:对于最老的版本,它是2Gb.64位版本的R(由OS强加)的限制是8Tb.
由于Windows在地址空间中间的预分配,即使在64位Windows上,通常也不可能将2Gb分配给32位版本的R中的单个向量.
在Windows下,R对单个会话可用的总内存分配施加限制,因为操作系统无法执行此操作:请参阅memory.size和memory.limit.