使用 RSelenium 爬取中检院生物制品批签发信息公示表

#java -jar /usr/local/bin/selenium-server-standalone-3.9.1.jar

library(RSelenium)
library(rvest)
library(stringr)
library(magrittr)
library(tidyverse)
# Client handle for the Selenium server (started separately, see the
# `java -jar` command near the top of the script); the session is driven
# through Firefox.
selenium_host <- "localhost"
selenium_port <- 4444
remDr <- remoteDriver(
  remoteServerAddr = selenium_host,
  port = selenium_port,
  browserName = "firefox"
)

# Open the browser session and load the listing page of published reports.
remDr$open()
remDr$navigate("http://www.nifdc.org.cn/CL0903/")

# Collect the href of every anchor on the current page whose text contains
# `link_text`. Always returns a character vector (possibly empty), so the
# result can be concatenated and iterated over safely.
collect_links <- function(driver, link_text) {
  anchors <- driver$findElements(using = "partial link text", link_text)
  hrefs <- lapply(anchors, function(e) e$getElementAttribute("href"))
  as.character(unlist(hrefs))
}

report_link_text <- "中检院生物制品批签发信息公示表"
links <- collect_links(remDr, report_link_text)

# Next page: findElements() returns an empty list when the "next page" link
# is absent, whereas findElement() would raise an error and abort the script.
next_anchors <- remDr$findElements(using = "partial link text", "下一页")
if (length(next_anchors) > 0) {
  next_url <- next_anchors[[1]]$getElementAttribute("href")[[1]]
  remDr$navigate(next_url)
  links <- c(links, collect_links(remDr, report_link_text))
}

# Column headers of the batch-release table, in the order the cells appear
# within each table row.
col_names <- c("序号", "产品名称", "规格", "批号", "签发量", "有效期至", "生产企业",
               "收检编号", "证书编号", "报告编号", "签发日期", "签发结论", "批签发机构")

# Reshape the flat vector of <td> texts into a character data frame with one
# row per table row. The first `length(col_names)` cells are the header row
# and are skipped; a trailing incomplete row (if any) is ignored. Returns a
# zero-row data frame with the right columns when there are no data rows.
cells_to_table <- function(cells, col_names) {
  n_cols <- length(col_names)
  n_rows <- max(length(cells) %/% n_cols - 1, 0)
  body <- cells[n_cols + seq_len(n_rows * n_cols)]
  tbl <- as.data.frame(
    matrix(body, ncol = n_cols, byrow = TRUE),
    stringsAsFactors = FALSE
  )
  colnames(tbl) <- col_names
  tbl
}

# Visit every report page and extract its table. Each URL is absolute, so
# there is no need to navigate back between pages.
page_tables <- lapply(links, function(url) {
  remDr$navigate(url)
  Sys.sleep(1)  # crude wait for the page to finish rendering
  table_body <- remDr$findElement(
    using = "xpath",
    "/html/body/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[5]/td/table/tbody"
  )
  cells <- read_html(table_body$getElementAttribute("outerHTML")[[1]]) %>%
    html_nodes("td") %>%
    html_text()
  cells_to_table(cells, col_names)
})

# Bind all pages in one step instead of growing the data frame inside the
# loop. The empty template guarantees the expected columns exist even when
# no page yielded any rows.
dataframe <- do.call(
  rbind,
  c(list(cells_to_table(character(0), col_names)), page_tables)
)

# Keep only the rows whose product name mentions a vaccine. str_detect()
# already yields a logical vector, and filter() drops NA matches, so no
# comparison against TRUE is needed.
dataframe <- dataframe %>%
  filter(str_detect(产品名称, "疫苗"))

评论

此博客中的热门博文

V2ray websocket(ws)+tls+nginx分流

Rstudio 使用代理