# 爬虫:生物制品批签发产品情况汇总
#java -jar /usr/local/bin/selenium-server-standalone-3.9.1.jar
library(RSelenium)
library(rvest)
library(stringr)
library(magrittr)
library(tidyverse)
# Describe the Selenium server this script talks to: a Firefox session
# served from localhost on port 4444.
remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4444,
  browserName = "firefox"
)

# Start the browser session and load the index page that lists the reports.
remDr$open()
remDr$navigate("http://bio.nifdc.org.cn/pqf/search.do?formAction=pqfGsByJG&orgId=1")

# Every anchor whose link text contains the report title is one summary
# page; collect the href attribute of each into a flat vector.
webElems <- remDr$findElements(
  using = "partial link text",
  value = "生物制品批签发产品情况汇总"
)
links <- unlist(
  lapply(
    webElems,
    function(anchor) {
      anchor$getElementAttribute("href")
    }
  )
)
# Scrape every summary page listed in `links` and stack the table rows into
# `dataframe` (all columns character, one row per table row on the pages).

# Column headers of the summary table, in on-page order.
col_names <- c(
  "序号", "产品名称", "规格", "批号", "签发量", "有效期至", "生产企业",
  "收检编号", "证书编号", "报告编号", "签发日期", "签发结论", "批签发机构"
)
n_cols <- length(col_names)

# One slot per page; the pieces are bound together once after the loop
# instead of growing `dataframe` with rbind() on every iteration.
pages <- vector("list", length(links))

# seq_along() makes the loop a no-op when no links were found, whereas
# 1:length(links) would iterate over c(1, 0).
for (i in seq_along(links)) {
  remDr$navigate(links[i])
  Sys.sleep(1) # pause before querying the freshly loaded page

  # Fetch the <tbody> of the inner table and hand its HTML to rvest.
  childWebElem <- remDr$findElement(
    using = "xpath",
    value = "/html/body/center/div/div[2]/table/tbody/tr/td/table/tbody"
  )
  doc <- read_html(childWebElem$getElementAttribute("outerHTML")[[1]])
  name <- doc %>%
    html_nodes("td") %>%
    html_text()

  if (length(name) %% n_cols != 0L) {
    warning(
      "Page ", links[i], " has ", length(name),
      " cells, not a multiple of ", n_cols, "; trailing cells are ignored.",
      call. = FALSE
    )
  }

  # The first n_cols cells are skipped, matching the original j * 13 + k
  # indexing. NOTE(review): this assumes the header row is made of <td>
  # cells - confirm against the live page.
  # The skipped row must not be counted as data, otherwise each page
  # contributes a trailing row made entirely of NA.
  n_rows <- max(length(name) %/% n_cols - 1L, 0L)
  cells <- name[n_cols + seq_len(n_rows * n_cols)]

  # Cells arrive row by row, so fill the matrix by row.
  df <- as.data.frame(
    matrix(cells, ncol = n_cols, byrow = TRUE),
    stringsAsFactors = FALSE
  )
  colnames(df) <- col_names
  pages[[i]] <- df

  remDr$goBack()
}

# Starting from an empty data frame keeps `dataframe` defined even when no
# page was scraped.
dataframe <- do.call(rbind, c(list(data.frame()), pages))
# Keep only the rows whose product name contains "疫苗" (vaccine).
# filter() already drops rows where the condition is NA, so no comparison
# against TRUE is needed (and `T` can be reassigned, unlike `TRUE`).
dataframe <- dataframe %>%
  filter(str_detect(产品名称, "疫苗"))

# Export the filtered table.
write.csv(dataframe, "/home/xuefliang/df.csv")

# Release the browser session opened by remDr$open(); nothing after this
# point uses it.
remDr$close()
library(RSelenium)
library(rvest)
library(stringr)
library(magrittr)
library(tidyverse)
# Describe the Selenium server this script talks to: a Firefox session
# served from localhost on port 4444.
remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4444,
  browserName = "firefox"
)

# Start the browser session and load the index page that lists the reports.
remDr$open()
remDr$navigate("http://bio.nifdc.org.cn/pqf/search.do?formAction=pqfGsByJG&orgId=1")

# Every anchor whose link text contains the report title is one summary
# page; collect the href attribute of each into a flat vector.
webElems <- remDr$findElements(
  using = "partial link text",
  value = "生物制品批签发产品情况汇总"
)
links <- unlist(
  lapply(
    webElems,
    function(anchor) {
      anchor$getElementAttribute("href")
    }
  )
)
# Scrape every summary page listed in `links` and stack the table rows into
# `dataframe` (all columns character, one row per table row on the pages).

# Column headers of the summary table, in on-page order.
col_names <- c(
  "序号", "产品名称", "规格", "批号", "签发量", "有效期至", "生产企业",
  "收检编号", "证书编号", "报告编号", "签发日期", "签发结论", "批签发机构"
)
n_cols <- length(col_names)

# One slot per page; the pieces are bound together once after the loop
# instead of growing `dataframe` with rbind() on every iteration.
pages <- vector("list", length(links))

# seq_along() makes the loop a no-op when no links were found, whereas
# 1:length(links) would iterate over c(1, 0).
for (i in seq_along(links)) {
  remDr$navigate(links[i])
  Sys.sleep(1) # pause before querying the freshly loaded page

  # Fetch the <tbody> of the inner table and hand its HTML to rvest.
  childWebElem <- remDr$findElement(
    using = "xpath",
    value = "/html/body/center/div/div[2]/table/tbody/tr/td/table/tbody"
  )
  doc <- read_html(childWebElem$getElementAttribute("outerHTML")[[1]])
  name <- doc %>%
    html_nodes("td") %>%
    html_text()

  if (length(name) %% n_cols != 0L) {
    warning(
      "Page ", links[i], " has ", length(name),
      " cells, not a multiple of ", n_cols, "; trailing cells are ignored.",
      call. = FALSE
    )
  }

  # The first n_cols cells are skipped, matching the original j * 13 + k
  # indexing. NOTE(review): this assumes the header row is made of <td>
  # cells - confirm against the live page.
  # The skipped row must not be counted as data, otherwise each page
  # contributes a trailing row made entirely of NA.
  n_rows <- max(length(name) %/% n_cols - 1L, 0L)
  cells <- name[n_cols + seq_len(n_rows * n_cols)]

  # Cells arrive row by row, so fill the matrix by row.
  df <- as.data.frame(
    matrix(cells, ncol = n_cols, byrow = TRUE),
    stringsAsFactors = FALSE
  )
  colnames(df) <- col_names
  pages[[i]] <- df

  remDr$goBack()
}

# Starting from an empty data frame keeps `dataframe` defined even when no
# page was scraped.
dataframe <- do.call(rbind, c(list(data.frame()), pages))
# Keep only the rows whose product name contains "疫苗" (vaccine).
# filter() already drops rows where the condition is NA, so no comparison
# against TRUE is needed (and `T` can be reassigned, unlike `TRUE`).
dataframe <- dataframe %>%
  filter(str_detect(产品名称, "疫苗"))

# Export the filtered table.
write.csv(dataframe, "/home/xuefliang/df.csv")

# Release the browser session opened by remDr$open(); nothing after this
# point uses it.
remDr$close()
评论
发表评论