RSelenium 爬取 中检院生物制品批签发信息公示表
#java -jar /usr/local/bin/selenium-server-standalone-3.9.1.jar
library(RSelenium)
library(rvest)
library(stringr)
library(magrittr)
library(tidyverse)
# Describe a Firefox session served by the Selenium server on localhost:4444.
remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4444,
  browserName = "firefox"
)
# Open the browser
remDr$open()

# Returns the href of every link on the page currently loaded in `driver`
# whose link text contains the bulletin title.
collect_bulletin_links <- function(driver) {
  anchors <- driver$findElements(using = "partial link text", "中检院生物制品批签发信息公示表")
  hrefs <- lapply(anchors, function(anchor) {
    anchor$getElementAttribute("href")
  })
  unlist(hrefs)
}

# First index page
remDr$navigate("http://www.nifdc.org.cn/CL0903/")
links <- collect_bulletin_links(remDr)

# Next page: follow the "下一页" link once and append that page's links too.
next_href <- remDr$findElement(using = "partial link text", "下一页")$getElementAttribute("href")[[1]]
remDr$navigate(next_href)
links <- c(links, collect_bulletin_links(remDr))
# Column headers of the bulletin table, in the order the cells appear on the
# page. Their count is the number of cells per table row.
bulletin_cols <- c("序号", "产品名称", "规格", "批号", "签发量", "有效期至", "生产企业", "收检编号", "证书编号", "报告编号", "签发日期", "签发结论", "批签发机构")

# Turns the flat character vector of <td> texts from one bulletin page into a
# data frame with one row per table row and one character column per entry of
# `col_names`. The first `length(col_names)` cells are skipped (the loop this
# replaces started reading at cell 14, i.e. after the first table row), and a
# trailing partial row is ignored. Returns a zero-row data frame that still
# carries all columns when there are no complete data rows.
parse_bulletin_cells <- function(cells, col_names) {
  n_cols <- length(col_names)
  n_rows <- length(cells) %/% n_cols - 1
  if (n_rows < 1) {
    body <- matrix(character(0), nrow = 0, ncol = n_cols)
  } else {
    # Row j, column k of the result is cells[j * n_cols + k].
    body <- matrix(
      cells[n_cols + seq_len(n_rows * n_cols)],
      ncol = n_cols,
      byrow = TRUE
    )
  }
  parsed <- as.data.frame(body, stringsAsFactors = FALSE)
  colnames(parsed) <- col_names
  parsed
}

# Visit every collected bulletin link and parse its table. Results are stored
# in a preallocated list and combined once after the loop instead of growing a
# data frame with rbind() on every iteration. seq_along() makes the loop a
# no-op when `links` is empty.
pages <- vector("list", length(links))
for (i in seq_along(links)) {
  remDr$navigate(links[i])
  Sys.sleep(1)
  childWebElem <- remDr$findElement(using = "xpath", "/html/body/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[5]/td/table/tbody")
  doc <- read_html(childWebElem$getElementAttribute("outerHTML")[[1]])
  cells <- doc %>%
    html_nodes("td") %>%
    html_text()
  pages[[i]] <- parse_bulletin_cells(cells, bulletin_cols)
  # Kept from the original script: step the browser history back one page.
  remDr$goBack()
}

# Lead with an empty, fully-named template so the combined frame always has
# the expected columns, even if no page yielded any rows.
dataframe <- do.call(
  rbind,
  c(list(parse_bulletin_cells(character(0), bulletin_cols)), pages)
)

# Keep only rows whose product name contains "疫苗"; rows where the name is NA
# are dropped by filter().
dataframe %<>% filter(str_detect(产品名称, "疫苗"))
library(RSelenium)
library(rvest)
library(stringr)
library(magrittr)
library(tidyverse)
# NOTE(review): this section repeats the session setup and link collection
# already performed earlier in this file (same server, same start URL). It
# opens another browser session; no close() of the earlier one is visible.

# Describe a Firefox session served by the Selenium server on localhost:4444.
remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4444,
  browserName = "firefox"
)
# Open the browser
remDr$open()

# Returns the href of every link on the page currently loaded in `driver`
# whose link text contains the bulletin title.
collect_bulletin_links <- function(driver) {
  anchors <- driver$findElements(using = "partial link text", "中检院生物制品批签发信息公示表")
  hrefs <- lapply(anchors, function(anchor) {
    anchor$getElementAttribute("href")
  })
  unlist(hrefs)
}

# First index page
remDr$navigate("http://www.nifdc.org.cn/CL0903/")
links <- collect_bulletin_links(remDr)

# Next page: follow the "下一页" link once and append that page's links too.
next_href <- remDr$findElement(using = "partial link text", "下一页")$getElementAttribute("href")[[1]]
remDr$navigate(next_href)
links <- c(links, collect_bulletin_links(remDr))
# NOTE(review): this section repeats the scrape performed earlier in this
# file; `dataframe` is rebuilt from scratch here.

# Column headers of the bulletin table, in the order the cells appear on the
# page. Their count is the number of cells per table row.
bulletin_cols <- c("序号", "产品名称", "规格", "批号", "签发量", "有效期至", "生产企业", "收检编号", "证书编号", "报告编号", "签发日期", "签发结论", "批签发机构")

# Turns the flat character vector of <td> texts from one bulletin page into a
# data frame with one row per table row and one character column per entry of
# `col_names`. The first `length(col_names)` cells are skipped (the loop this
# replaces started reading at cell 14, i.e. after the first table row), and a
# trailing partial row is ignored. Returns a zero-row data frame that still
# carries all columns when there are no complete data rows.
parse_bulletin_cells <- function(cells, col_names) {
  n_cols <- length(col_names)
  n_rows <- length(cells) %/% n_cols - 1
  if (n_rows < 1) {
    body <- matrix(character(0), nrow = 0, ncol = n_cols)
  } else {
    # Row j, column k of the result is cells[j * n_cols + k].
    body <- matrix(
      cells[n_cols + seq_len(n_rows * n_cols)],
      ncol = n_cols,
      byrow = TRUE
    )
  }
  parsed <- as.data.frame(body, stringsAsFactors = FALSE)
  colnames(parsed) <- col_names
  parsed
}

# Visit every collected bulletin link and parse its table. Results are stored
# in a preallocated list and combined once after the loop instead of growing a
# data frame with rbind() on every iteration. seq_along() makes the loop a
# no-op when `links` is empty.
pages <- vector("list", length(links))
for (i in seq_along(links)) {
  remDr$navigate(links[i])
  Sys.sleep(1)
  childWebElem <- remDr$findElement(using = "xpath", "/html/body/div/table[2]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr[5]/td/table/tbody")
  doc <- read_html(childWebElem$getElementAttribute("outerHTML")[[1]])
  cells <- doc %>%
    html_nodes("td") %>%
    html_text()
  pages[[i]] <- parse_bulletin_cells(cells, bulletin_cols)
  # Kept from the original script: step the browser history back one page.
  remDr$goBack()
}

# Lead with an empty, fully-named template so the combined frame always has
# the expected columns, even if no page yielded any rows.
dataframe <- do.call(
  rbind,
  c(list(parse_bulletin_cells(character(0), bulletin_cols)), pages)
)

# Keep only rows whose product name contains "疫苗"; rows where the name is NA
# are dropped by filter().
dataframe %<>% filter(str_detect(产品名称, "疫苗"))
评论
发表评论