# Python web scraper (python 爬虫)

import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Fetch the TWSE listed-company salary survey for ROC year 108 (2019)
# and plot the year-108 median salary per company as a bar chart.
url = 'https://mops.twse.com.tw/mops/web/ajax_t100sb15'
proxy = {"http": "http://127.0.0.1:8889"}  # local debugging proxy — remove if not needed
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0'}
payload = {
    'encodeURIComponent': "1",
    'step': "1",
    'firstin': "1",
    'TYPEK': "sii",   # listed ("sii") companies
    'RYEAR': "108",   # ROC year 108 = 2019
}
res = requests.post(url, data=payload, proxies=proxy, headers=headers)
print(res.text)

# The response is an HTML page; read_html returns every table found in it.
dfs = pd.read_html(res.text)

# Keep columns: industry, company id, company name, mean(108), mean(107), median(108).
df = dfs[0].iloc[:, [0, 1, 2, 5, 6, 7]]
df.head()
df.info()

df.columns = ['產業類別', '公司代號', '公司名稱', '平均數108', '平均數107', '中位數108']
# BUG FIX: sort_values returns a new DataFrame; the original discarded the
# sorted result, so the plots below were drawn in the unsorted order.
df = df.sort_values('中位數108', ascending=False)

# Bar chart of year-108 median salaries, now sorted descending.
df['中位數108'].plot(kind='bar', title='bar title')

df['中位數108'].plot(kind="bar", stacked=True)  # stacked bar chart
plt.show()


import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

# Scrape Weibo mobile-API search results for the #疫苗# (vaccine) hashtag
# and collect the post ("mblog") records from result pages 1-4.
articles = []
for page in range(1, 5):
    # BUG FIX: the loop body below was unindented in the original, which is
    # an IndentationError — only one page would ever have been fetched.
    url = 'https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D1%26q%3D%2523%25E7%2596%25AB%25E8%258B%2597%2523&page_type=searchall&page={}'.format(page)
    res = requests.get(url)
    jd = res.json()
    # Some cards are ads/headers with no 'mblog' payload — skip those.
    articles.extend([rec['mblog'] for rec in jd['data']['cards'] if rec.get('mblog')])
# Debug output: inspect the JSON of the last page fetched.
print(jd)
print(jd['data']['cards'])

# One row per post; columns come from the mblog record's keys.
df = pd.DataFrame(articles)
df.head()
df.info()
df['text']
len(df)

def parseArticle(e):
    """Strip HTML markup from a post body and return its plain text.

    BUG FIX: the function body was unindented in the original, which is
    an IndentationError.
    """
    soup = BeautifulSoup(e, 'lxml')
    return soup.text

# Apply the HTML-stripping to every post's 'text' column.
df['text'].map(parseArticle)

# --- Blog-page residue below (not code), kept as translated comments ---
# 评论 (comments section)
# 此博客中的热门博文 (popular posts on this blog)
# V2ray websocket(ws)+tls+nginx 分流 (V2ray ws+tls+nginx traffic splitting)
# Rstudio 使用代理 (using a proxy with RStudio)