天气爬虫

import logging
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with every request.
headers = {
    "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}

# Years and zero-padded months substituted into the archive URL.
year = ['2011', '2012', '2013', '2014', '2015', '2016',
        '2017', '2018', '2019', '2020', '2021', '2022']
mon = ['01', '02', '03', '04', '05', '06',
       '07', '08', '09', '10', '11', '12']

# Column layout of the accumulated result table.
COLUMNS = ['日期', '最高气温', '最低气温', '天气', '风向']

# Seconds to wait for the server before giving up on one request.
REQUEST_TIMEOUT = 10
# Pause between consecutive requests, in seconds.
REQUEST_DELAY = 0.5

logger = logging.getLogger(__name__)


def rows_from_div_texts(div_texts):
    """Split the raw text of the ``tian_three`` divs into rows of fields.

    Records are separated by four consecutive newlines and fields by a
    single newline; the "查看更多" link text is removed. The first row
    returned is the header row. An empty input yields ``[['']]``.
    """
    cleaned = [
        text.replace("\n\n\n\n", ";")
        .replace("\n\n", "")
        .replace("\n查看更多", "")
        .replace("\n", ",")
        for text in div_texts
    ]
    joined = " ".join(cleaned).replace(";,", ";")
    return [record.split(",") for record in joined.split(";")]


def frame_from_div_texts(div_texts):
    """Build one month's DataFrame from the text of its table divs.

    Returns ``None`` when no data rows are present (for example, the page
    contained no ``tian_three`` div), so the caller can skip the month
    instead of concatenating a frame with a bogus ``''`` column.

    Raises:
        ValueError: raised by pandas when a row has more fields than the
            header row.
    """
    rows = rows_from_div_texts(div_texts)
    header = rows[0]
    # A trailing separator produces an all-blank record; drop those.
    records = [row for row in rows[1:] if any(field.strip() for field in row)]
    if not records:
        return None
    return pd.DataFrame(records, columns=header)


def fetch_month(y, m):
    """Download and parse the archive page for year ``y`` and month ``m``.

    Returns the month's DataFrame, or ``None`` if the page has no table.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-success HTTP status.
        ValueError: if the table rows do not fit the header.
    """
    response = requests.get(
        f"http://lishi.tianqi.com/pingliang/{y}{m}.html",
        headers=headers,
        timeout=REQUEST_TIMEOUT,  # without this a stalled connection hangs forever
    )
    # Do not parse an error page as if it were weather data.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    divs = soup.find_all("div", class_='tian_three')
    return frame_from_div_texts([div.text for div in divs])


def scrape_all(years, months):
    """Scrape every (year, month) page and return one combined DataFrame.

    A month that fails to download or parse is logged and skipped so that
    a single bad page does not discard everything scraped so far.
    """
    # Start with an empty frame so the expected columns come first and
    # exist even if nothing could be scraped.
    frames = [pd.DataFrame(columns=COLUMNS)]
    for y in years:
        for m in months:
            try:
                df = fetch_month(y, m)
            except (requests.RequestException, ValueError) as exc:
                logger.warning("Skipping %s-%s: %s", y, m, exc)
                df = None
            else:
                if df is None:
                    logger.warning("No weather table found for %s-%s", y, m)
            # Be polite to the server between requests.
            time.sleep(REQUEST_DELAY)
            if df is not None:
                frames.append(df)
    # Concatenate once; concatenating inside the loop re-copies all
    # previously collected rows on every iteration.
    return pd.concat(frames, ignore_index=True)


if __name__ == "__main__":
    tian = scrape_all(year, mon)
    tian.to_excel("D:\\tianqi.xlsx", index=False)

评论

此博客中的热门博文

V2ray websocket(ws)+tls+nginx分流

Rstudio 使用代理