
I am trying to use the RedditExtractoR package, as I have many times in the past. I hadn't used it since last month, but when I tried it this week it returned an empty data frame.

get_reddit(subreddit="jokes") 
|======================================================================================================================================| 100% 
 [1] id               structure        post_date        comm_date        num_comments     subreddit        upvote_prop      post_score      
 [9] author           user             comment_score    controversiality comment          title            post_text        link            
[17] domain           URL             
<0 rows> (or 0-length row.names)

Exploring the get_reddit() function, it appears to call reddit_urls() and then fetch each resulting URL's page as JSON. reddit_urls() returns a data frame of Reddit page URLs, and appending .json to the end of one of those URLs does still load the page as a JSON object.
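To check that manually, here is a minimal sketch of the same approach the package takes (RJSONIO::fromJSON over readLines); the subreddit listing URL and the fields pulled out are just illustrative, and Reddit may rate-limit unauthenticated requests:

library(RJSONIO)

url   <- "https://www.reddit.com/r/jokes/.json?limit=5"
raw   <- fromJSON(paste(readLines(url, warn = FALSE), collapse = ""))
posts <- raw$data$children   # one list element per post in the listing

data.frame(
  title        = sapply(posts, function(p) p$data$title),
  num_comments = sapply(posts, function(p) p$data$num_comments),
  stringsAsFactors = FALSE
)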

Has anyone else had problems with this package, and/or does anyone know a workaround for parsing the JSON objects into a data frame?

Thanks

Answer


I had the same problem. Here is my fix...

If you remove the https?:// option in reddit_content() and require the URL to start with https://, the function looks like this:

reddit_content <- function (URL, wait_time = 2) 
{ 
    if (is.null(URL) | length(URL) == 0 | !is.character(URL)) { 
     stop("invalid URL parameter") 
    } 
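    # GetAttribute recursively collects one field from a comment node and all of its replies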
    GetAttribute = function(node, feature) { 
     Attribute = node$data[[feature]] 
     replies = node$data$replies 
     reply.nodes = if (is.list(replies)) 
      replies$data$children 
     else NULL 
     return(list(Attribute, lapply(reply.nodes, function(x) { 
      GetAttribute(x, feature) 
     }))) 
    } 
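    # get.structure walks the comment tree, recording each node's position path (e.g. "1_2_1")
    # and flagging nodes without an author so they can be filtered out later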
    get.structure = function(node, depth = 0) { 
     if (is.null(node)) { 
      return(list()) 
     } 
     filter = is.null(node$data$author) 
     replies = node$data$replies 
     reply.nodes = if (is.list(replies)) 
      replies$data$children 
     else NULL 
     return(list(paste0(filter, " ", depth), lapply(1:length(reply.nodes), 
      function(x) get.structure(reply.nodes[[x]], paste0(depth, 
       "_", x))))) 
    } 
    data_extract = data.frame(id = numeric(), structure = character(), 
     post_date = as.Date(character()), comm_date = as.Date(character()), 
     num_comments = numeric(), subreddit = character(), upvote_prop = numeric(), 
     post_score = numeric(), author = character(), user = character(), 
     comment_score = numeric(), controversiality = numeric(), 
     comment = character(), title = character(), post_text = character(), 
     link = character(), domain = character(), URL = character()) 
    pb = utils::txtProgressBar(min = 0, max = length(URL), style = 3) 
    for (i in seq(URL)) { 
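     # key change: require https:// here (the stock function matched https?://, so it also accepted plain http)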
     if (!grepl("^https://(.*)", URL[i])) 
      URL[i] = paste0("https://www.", gsub("^.*(reddit\\..*$)", 
       "\\1", URL[i])) 
     if (!grepl("\\?ref=search_posts$", URL[i])) 
      URL[i] = paste0(gsub("/$", "", URL[i]), "/?ref=search_posts") 
     X = paste0(gsub("\\?ref=search_posts$", "", URL[i]), 
      ".json?limit=500") 
     raw_data = tryCatch(RJSONIO::fromJSON(readLines(X, warn = FALSE)), 
      error = function(e) NULL) 
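     # if the first fetch failed, pause briefly and retry once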
     if (is.null(raw_data)) { 
      Sys.sleep(min(1, wait_time)) 
      raw_data = tryCatch(RJSONIO::fromJSON(readLines(X, 
       warn = FALSE)), error = function(e) NULL) 
     } 
     if (is.null(raw_data) == FALSE) { 
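      # a thread's JSON has two listings: the post itself, then its comment tree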
      meta.node = raw_data[[1]]$data$children[[1]]$data 
      main.node = raw_data[[2]]$data$children 
      if (min(length(meta.node), length(main.node)) > 0) { 
       structure = unlist(lapply(1:length(main.node), 
        function(x) get.structure(main.node[[x]], x))) 
       TEMP = data.frame(id = NA, structure = gsub("FALSE ", 
        "", structure[!grepl("TRUE", structure)]), 
        post_date = format(as.Date(as.POSIXct(meta.node$created_utc, 
        origin = "1970-01-01")), "%d-%m-%y"), comm_date = format(as.Date(as.POSIXct(unlist(lapply(main.node, 
        function(x) { 
         GetAttribute(x, "created_utc") 
        })), origin = "1970-01-01")), "%d-%m-%y"), 
        num_comments = meta.node$num_comments, subreddit = ifelse(is.null(meta.node$subreddit), 
        "UNKNOWN", meta.node$subreddit), upvote_prop = meta.node$upvote_ratio, 
        post_score = meta.node$score, author = meta.node$author, 
        user = unlist(lapply(main.node, function(x) { 
        GetAttribute(x, "author") 
        })), comment_score = unlist(lapply(main.node, 
        function(x) { 
         GetAttribute(x, "score") 
        })), controversiality = unlist(lapply(main.node, 
        function(x) { 
         GetAttribute(x, "controversiality") 
        })), comment = unlist(lapply(main.node, function(x) { 
        GetAttribute(x, "body") 
        })), title = meta.node$title, post_text = meta.node$selftext, 
        link = meta.node$url, domain = meta.node$domain, 
        URL = URL[i], stringsAsFactors = FALSE) 
       TEMP$id = 1:nrow(TEMP) 
       if (dim(TEMP)[1] > 0 & dim(TEMP)[2] > 0) 
        data_extract = rbind(TEMP, data_extract) 
       else print(paste("missed", i, ":", URL[i])) 
      } 
     } 
     utils::setTxtProgressBar(pb, i) 
     Sys.sleep(min(2, wait_time)) 
    } 
    close(pb) 
    return(data_extract) 
} 

Then the get_reddit() function is redefined as:

get_reddit <- function (search_terms = NA, regex_filter = "", subreddit = NA, 
    cn_threshold = 1, page_threshold = 1, sort_by = "comments", 
    wait_time = 2) 
{ 
    URL = unique(as.character(reddit_urls(search_terms, regex_filter, 
     subreddit, cn_threshold, page_threshold, sort_by, wait_time)$URL)) 
    retrieved_data = reddit_content(URL, wait_time) 
    return(retrieved_data) 
}
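With the two patched functions defined in the global environment they shadow the package versions, while reddit_urls() still resolves to the one in RedditExtractoR. A usage sketch, assuming the package is attached; re-running the call from the question should now return rows:

library(RedditExtractoR)   # needed so reddit_urls() is found

jokes <- get_reddit(subreddit = "jokes")
nrow(jokes)        # should now be greater than 0
head(jokes$title)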