爬虫 生物制品批签发产品情况汇总

#java -jar /usr/local/bin/selenium-server-standalone-3.9.1.jar

library(RSelenium)

library(rvest)

library(stringr)

library(magrittr)

library(tidyverse)

# Scrape every "生物制品批签发产品情况汇总" page linked from the NIFDC search
# page, stack the table rows into one data frame, keep only the vaccine
# products, and write the result to CSV.
#
# Requires a Selenium server listening on localhost:4444.

# Column names assigned to each group of 13 <td> cells, in page order.
release_columns <- c(
  "序号", "产品名称", "规格", "批号", "签发量", "有效期至", "生产企业",
  "收检编号", "证书编号", "报告编号", "签发日期", "签发结论", "批签发机构"
)

# Reshape the flat vector of <td> texts from one page into a data frame with
# one row per group of `length(col_names)` cells.
#
# cells     Character vector of cell texts in document order.
# col_names Column names; their count defines the cells per row.
#
# Returns a data frame of character columns (zero rows for empty input).
parse_release_table <- function(cells, col_names = release_columns) {
  n_col <- length(col_names)
  # Only complete rows are used; a trailing partial group is discarded.
  n_row <- length(cells) %/% n_col
  grid <- matrix(cells[seq_len(n_row * n_col)], ncol = n_col, byrow = TRUE)
  # The first group of cells is skipped, as the original indexing did
  # (it started reading at cell 14).
  # NOTE(review): presumably that first group is the table's header row,
  # rendered with <td> rather than <th> -- confirm against the live page.
  grid <- grid[seq_len(n_row)[-1], , drop = FALSE]
  out <- as.data.frame(grid, stringsAsFactors = FALSE)
  colnames(out) <- col_names
  out
}

remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4444L,
  browserName = "firefox"
)

# Open the browser.
remDr$open()

# `finally` closes the browser session even when a page fails to load or
# parse, so no orphaned Firefox session is left on the Selenium server.
dataframe <- tryCatch(
  {
    remDr$navigate("http://bio.nifdc.org.cn/pqf/search.do?formAction=pqfGsByJG&orgId=1")

    webElems <- remDr$findElements(
      using = "partial link text",
      "生物制品批签发产品情况汇总"
    )
    links <- unlist(lapply(webElems, function(e) e$getElementAttribute("href")))

    # lapply() over `links` is a no-op when no link was found, unlike
    # 1:length(links), which would iterate over c(1, 0).
    pages <- lapply(links, function(link) {
      # Each page is opened by its own href, so no goBack() is needed
      # between iterations.
      remDr$navigate(link)
      # Fixed pause to let the page render before querying the DOM.
      Sys.sleep(1)

      childWebElem <- remDr$findElement(
        using = "xpath",
        "/html/body/center/div/div[2]/table/tbody/tr/td/table/tbody"
      )
      doc <- read_html(childWebElem$getElementAttribute("outerHTML")[[1]])
      cells <- doc %>%
        html_nodes("td") %>%
        html_text()

      parse_release_table(cells)
    })

    # Bind once instead of rbind()-ing inside the loop. The leading zero-row
    # frame guarantees the expected columns exist even when no page was
    # scraped, so the filter below cannot fail on a missing column.
    bind_rows(parse_release_table(character(0)), pages)
  },
  finally = remDr$close()
)

# Keep only rows whose product name mentions "疫苗" (vaccine).
dataframe %<>% filter(str_detect(产品名称, "疫苗"))
write.csv(dataframe, "/home/xuefliang/df.csv")

评论

此博客中的热门博文

V2ray websocket(ws)+tls+nginx分流

Rstudio 使用代理