# 天气爬虫 (weather scraper)
import time
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
# HTTP headers sent with every page request: a desktop-browser User-Agent.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
}

# URL components for the pages to fetch: years "2011".."2022" and
# zero-padded months "01".."12", both kept as strings.
year = [str(y) for y in range(2011, 2023)]
mon = [f"{m:02d}" for m in range(1, 13)]

# Empty result table. Column labels are: date, highest temperature,
# lowest temperature, weather, wind direction.
tian = pd.DataFrame(columns=["日期", "最高气温", "最低气温", "天气", "风向"])
# Fetch one history page per (year, month), turn the text of its
# "tian_three" div(s) into a DataFrame, and add every month's rows to `tian`.
monthly_frames = []
for y in year:
    for m in mon:
        try:
            # A timeout keeps a single stalled connection from hanging the run.
            response = requests.get(
                f"http://lishi.tianqi.com/pingliang/{y}{m}.html",
                headers=headers,
                timeout=10,
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            # One failed month should not discard everything fetched so far;
            # report it and move on to the next page.
            print(f"skipping {y}{m}: {exc}")
            continue
        finally:
            # Pause after every request attempt, successful or not.
            time.sleep(0.5)

        soup = BeautifulSoup(response.text, 'lxml')
        divs = soup.find_all("div", class_='tian_three')
        if not divs:
            # No data container on this page. Parsing it anyway would yield a
            # frame whose only column is '' and pollute the result table.
            continue

        # Flatten each div's text into "field,field,...;field,field,..." form:
        # runs of four newlines become the record separator ';', the
        # "查看更多" link text is dropped, and remaining newlines become ','.
        row_data = [
            div.text.replace("\n\n\n\n", ";")
            .replace("\n\n", '')
            .replace('\n查看更多', '')
            .replace('\n', ',')
            for div in divs
        ]
        text_data = ' '.join(row_data).replace(";,", ";")
        data_list = [record.split(',') for record in text_data.split(';')]

        # The first record supplies the column labels; the rest are data rows.
        df = pd.DataFrame(data_list[1:], columns=data_list[0])
        monthly_frames.append(df)

# Concatenate once at the end instead of growing `tian` inside the loop, which
# recopies the accumulated rows on every iteration. pd.concat is also the
# replacement for DataFrame.append, which was removed in pandas 2.0.
if monthly_frames:
    tian = pd.concat([tian, *monthly_frames], ignore_index=True)
# Write the accumulated table to an Excel workbook at a fixed Windows path;
# index=False keeps the DataFrame index out of the sheet.
# NOTE(review): presumably meant to run once after both loops finish — confirm.
tian.to_excel("D:\\tianqi.xlsx",index=False)
评论
发表评论