GoogleNews

import logging
import urllib
import urllib.parse
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup as soup

class GoogleNews():
    """Collect Google News search results for a keyword query.

    Parsed entries accumulate across ``getpage()`` calls in three parallel
    lists (``texts``, ``links``, ``results``) until ``clear()`` is called.
    Fetching is best-effort: network or markup problems are logged and the
    page is skipped instead of raising.
    """

    # Seconds to wait for the HTTP response before giving up on a page.
    timeout = 10

    def __init__(self):
        """Start with empty title, link and result lists."""
        self.texts = []
        self.links = []
        self.results = []

    def search(self, key):
        """Store the query and fetch the first page of results.

        Args:
            key: Search phrase; whitespace-separated words are joined
                with ``+`` and stored in ``self.key``.
        """
        # split() without an argument drops empty tokens, so repeated
        # spaces do not turn into "++" in the query.
        self.key = "+".join(key.split())
        self.getpage()

    def _build_url(self, page=1):
        """Return the search URL for the given 1-based result page."""
        # Pages below 1 would yield a negative start offset; clamp to 0.
        start = 10 * max(page - 1, 0)
        # Percent-encode non-ASCII text (e.g. Chinese) but keep '+' intact,
        # because search() uses it as the word separator.
        query = urllib.parse.quote(self.key, safe="+")
        return ("https://www.google.com/search?q=" + query
                + "&tbm=nws&start=%d" % start)

    @staticmethod
    def _parse_item(item):
        """Turn one result node into a dict, or None if it has no link.

        Optional parts (media, date, description, image) fall back to an
        empty string / None so one incomplete entry does not abort the page.
        """
        heading = item.find("h3")
        anchor = heading.find("a") if heading is not None else None
        if anchor is None:
            return None
        meta = item.find("div", class_="slp")
        spans = meta.find_all("span") if meta is not None else []
        desc = item.find("div", class_="st")
        img = item.find("img")
        return {
            'title': heading.text,
            'media': spans[0].text if len(spans) > 0 else '',
            'date': spans[2].text if len(spans) > 2 else '',
            'desc': desc.text if desc is not None else '',
            'link': anchor.get("href"),
            'img': img.get("src") if img is not None else None,
        }

    def getpage(self, page=1):
        """Fetch one result page and append its entries to the lists.

        Args:
            page: 1-based page number; values below 1 are treated as 1.
        """
        log = logging.getLogger(__name__)
        self.user_agent = 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:64.0) Gecko/20100101 Firefox/64.0'
        self.headers = {'User-Agent': self.user_agent}
        self.url = self._build_url(page)
        self.req = urllib.request.Request(self.url, headers=self.headers)
        try:
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(self.req, timeout=self.timeout) as response:
                self.response = response
                self.page = response.read()
        except OSError as exc:  # URLError, HTTPError and timeouts
            log.warning("Could not fetch %s: %s", self.url, exc)
            return

        self.content = soup(self.page, "html.parser")
        container = self.content.find(id="ires")
        if container is None:
            log.warning("No result container (id='ires') found in %s", self.url)
            return

        for item in container.find_all("div", class_="g"):
            entry = self._parse_item(item)
            if entry is None:
                continue
            # Append to all three lists together so they stay aligned.
            self.texts.append(entry['title'])
            self.links.append(entry['link'])
            self.results.append(entry)

    def result(self):
        """Return the list of result dicts collected so far."""
        return self.results

    def gettext(self):
        """Return the list of result titles collected so far."""
        return self.texts

    def getlinks(self):
        """Return the list of result links collected so far."""
        return self.links

    def clear(self):
        """Discard everything collected so far."""
        self.texts = []
        self.links = []
        self.results = []


# Example run: search Google News, collect three result pages, and save
# the titles and links to a CSV file.
googlenews = GoogleNews()
# googlenews.search('vaccine')
googlenews.search('疫苗 反应')  # search() already fetches page 1

# Fetch pages 2 and 3 only; requesting page 0 or page 1 again would use a
# negative offset or duplicate the rows that search() just collected.
for page in range(2, 4):
    googlenews.getpage(page)

titles = googlenews.gettext()
links = googlenews.getlinks()

df = pd.DataFrame(list(zip(titles, links)), columns=['title', 'link'])
df.to_csv('/home/xuefliang/Downloads/result.csv', index=False)

评论

此博客中的热门博文

V2ray websocket(ws)+tls+nginx分流

Rstudio 使用代理