博文

目前显示的是 2020的博文

na替换0

 # Pipeline step (use as `df %>% mutate(...)`): replace every NA with 0 in all
 # columns selected by everything().
 # NOTE(review): the replacement value 0 is numeric; non-numeric columns may
 # need a different fill value -- confirm against the data this is applied to.
 mutate(across(everything(), ~replace_na(.x, 0)))

R 多列类型转换

# Convert several columns to numeric at once. Each call below is a pipeline
# step (use as `df %>% mutate(...)`). across() replaces the superseded scoped
# verbs mutate_at() / mutate_if().
mutate(across(contains("数"), as.numeric))       # names containing "数"
mutate(across(matches("数"), as.numeric))        # same selection via a regex
mutate(across(where(is.character), as.numeric))  # every character column

# In dplyr: select the variables whose name contains "Sepal" and assign NA to
# the rows where Species is "setosa".
iris %>%
  mutate(across(contains("Sepal"), ~ ifelse(Species == "setosa", NA, .x)))

# Equivalent form with replace(). The earlier na_if(Species, "setosa") variant
# overwrote the Sepal columns with Species values instead of blanking them.
iris %>%
  mutate(across(contains("Sepal"), ~ replace(.x, Species == "setosa", NA)))

pip conda you-get 使用代理

 pip install --proxy http://127.0.0.1:7890 pandas --upgrade conda config --set proxy_servers.http  http://127.0.0.1:7890 you-get -x 127.0.0.1:7890 'https://www.youtube.com/watch?v=jNQXAC9IVRw'

R proxy

# Set the http_proxy and https_proxy environment variables for this R session.
Sys.setenv(
  http_proxy = "http://127.0.0.1:8001",
  https_proxy = "https://127.0.0.1:8001"
)

分组抽样

 library(sampling) sam_group <- strata(dat,stratanames = 'xiangzhen',size = seq(1:20),method = 'srswr') sam_group <- strata(dat,stratanames = 'xiangzhen',size = rep(3,20),method = 'srswr') #分组抽样 sam_group <- dat %>%    filter(weihao !='000') %>%    group_by(xiangzhen) %>%    slice(sample(3))

python 爬虫

import requests import pandas as pd import numpy as np import matplotlib.pyplot as plt url= 'https://mops.twse.com.tw/mops/web/ajax_t100sb15' proxy = { "http" : "http://127.0.0.1:8889" ,} headers= { 'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:77.0) Gecko/20100101 Firefox/77.0' } payload={ 'encodeURIComponent' : "1" , 'step' : "1" , 'firstin' : "1" , 'TYPEK' : "sii" , 'RYEAR' : "108" } res=requests.post(url,data=payload,proxies=proxy,headers=headers) #res=requests.post(url,data=payload) print(res.text) dfs=pd.read_html(res.text) df=dfs[ 0 ].iloc[:,[ 0 , 1 , 2 , 5 , 6 , 7 ]] df.head() df.info() df.columns=[ '產業類別' , '公司代號' , '公司名稱' , '平均數108' , '平均數107' , '中位數108' ] df.sort_values( '中位數108' ,ascending= False ) #df.plot(kind='bar',title='bar title&#

R 添加分隔符

 library(tidyverse) library(openxlsx) dat <- read.xlsx('./Documents/镇原县注射器库存.xlsx') %>%    select(b) %>%    as_vector() %>%    str_c(collapse = "','")

R 计算月龄

library(lubridate)

#' Age in days between a birth date and a reference date
#'
#' Builds the interval from `birthDate` to `refDate`, expresses it as a
#' period in days, and returns that period's day component. The result is in
#' days, not months.
#'
#' @param birthDate Start of the interval (date or date-time).
#' @param refDate End of the interval; defaults to today (`Sys.Date()`).
#' @return The day component of the period, one value per input element.
calc_age <- function(birthDate, refDate = Sys.Date()) {
  # Namespace-qualified calls replace require(), which only returns FALSE
  # (instead of failing) when the package is missing.
  span <- lubridate::interval(birthDate, refDate)
  lubridate::as.period(span, unit = "days")$day
}

# Reaching 8 months of age
# NOTE(review): the code below subtracts 3 * 12 months (3 years), not
# 8 months -- confirm which offset is intended.
ymd("2020-05-31") %m-% months(3 * 12)
ymd("2019-12-31") %m+% months(1:12)

hosts修改

vim  /etc/hosts 0.0.0.0 account.jetbrains.com 0.0.0.0 www.jetbrains.com 127.0.0.1 transact.netsarang.com 127.0.0.1 update.netsarang.com 127.0.0.1 www.netsarang.com 127.0.0.1 www.netsarang.co.kr 127.0.0.1 sales.netsarang.com

分组建模

# Fit lm(mpg ~ wt) separately for every `cyl` group of mtcars and tidy the
# coefficients with broom -- six alternative spellings.

# 1. group_modify(): the lambda receives each group's rows as `.x`.
mtcars %>%
  group_by(cyl) %>%
  group_modify(~ broom::tidy(lm(mpg ~ wt, data = .x)))

# 2. summarise(): the formula variables are looked up in the current group.
mtcars %>%
  group_by(cyl) %>%
  summarise(broom::tidy(lm(mpg ~ wt)))

# 3. summarise() with the current group's data passed explicitly.
mtcars %>%
  group_by(cyl) %>%
  summarise(broom::tidy(lm(mpg ~ wt, data = cur_data())))

# 4. nest() + purrr::map(): keep the fitted models in a list-column.
mtcars %>%
  group_by(cyl) %>%
  nest() %>%
  mutate(
    model = purrr::map(data, ~ lm(mpg ~ wt, data = .x)),
    result = purrr::map(model, broom::tidy)
  ) %>%
  unnest(result)

# 5. nest_by(): row-wise nested data, model stored in a list-column.
mtcars %>%
  nest_by(cyl) %>%
  mutate(model = list(lm(mpg ~ wt, data = data))) %>%
  summarise(broom::tidy(model))

# 6. nest_by() with fitting and tidying done in one summarise().
mtcars %>%
  nest_by(cyl) %>%
  summarise(broom::tidy(lm(mpg ~ wt, data = data)))

LBE 安装

# Install BiocManager if it is missing, select Bioconductor 3.11, then install
# the LBE package. The guard is needed only once.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install(version = "3.11")
BiocManager::install("LBE")

# Shell command (run in a terminal, not in R): reconfigure R's Java settings.
# JAVA_HOME must be the JDK directory itself, not the bin/jar executable.
sudo R CMD javareconf JAVAC=/usr/lib/jvm/java-1.8.0-openjdk-amd64/bin/javac JAR=/usr/lib/jvm/java-1.8.0-openjdk-amd64/bin/jar JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64

计算接种率

# R implementation
inoc %>%
  # keep rows whose gldw_bm starts with "62"
  filter(str_sub(gldw_bm, 1, 2) == "62") %>%
  # yzyl: days from csrq to 2021-01-25, divided by 30 and rounded down
  mutate(
    yzyl = floor(
      day(as.period(interval(csrq, ymd("2021-01-25")), unit = "days")) / 30
    )
  ) %>%
  filter(yzyl >= 8, yzyl < 18) %>%
  group_by(xian) %>%
  summarise(
    deno = n_distinct(grda_et_lsh),
    num = sum(str_sub(ym_bm, 1, 2) %in% "14" & jz_zc %in% 1, na.rm = TRUE)
  ) %>%
  # percentage with one decimal place
  mutate(prec = round(num / deno * 100, 1)) %>%
  left_join(xzqh, by = c("xian" = "dzbm")) %>%
  arrange(xian)

# Python implementation
(
    inoc
    >> head(1000)
    >> mutate(
        yzyl=((pd.to_datetime('2021-01-25') - _.csrq).dt.days / 30),
        xian=_.gldw_bm.str[0:6],
    )
    >> filter(_.gldw_bm.str[0:2] == '62', _.yzyl >= 8)
    >> group_by(_.xian)
    >> summarize(
        n=n_distinct(_.grda_code),
        n3=(((_.ym_bm.str[0:2] == '12') & (_.jz_zc == 1)).sum(skipna=True)),
    )
    >> mutate(pert=np.round(_.n3 / _.n * 100, 2))
    >> arrange(_.xian)
)

forcats包

library(tidyverse) x <- c("A","z","g",NA) x %>% as.factor()  # 默认按照字母排序 x %>% as_factor()  # 默认按照出现顺序排列 gss_cat$relig %>% fct_count() %>% arrange(-n) #fct_anon:用任意数字标识符替换因子级别 gss_cat$relig %>% fct_anon() %>% fct_count() %>% arrange(-n) gss_cat$relig %>% fct_anon("X") %>% fct_count() #fct_c:合并级别,连接因子 fa <- factor("a") fb <- factor("b") fab <- factor(c("a", "b")) c(fa, fb, fab) fct_c(fa, fb, fab) #fct_collapse:因子转换,将多个因子合并 fct_count(gss_cat$partyid) %>% arrange(n) partyid2 <- fct_collapse(gss_cat$partyid,                          missing = c("No answer", "Don't know"),                          other = "Other party",                          rep = c("Strong republican", "Not str republican"),                          ind = c("Ind,near rep", "Independent", "Ind,near

ifelse sqlite

library(DBI) library(openxlsx) library(tidyverse) library(magrittr) library(hablar) con <- dbConnect(RSQLite::SQLite(), "~/Downloads/phone.db") dbListTables(con) phones <- dbGetQuery(con, "select * from phones") #phones <- tbl(con,sql("select * from phones")) %>% collect() regions <- dbGetQuery(con, "select * from regions") phones %<>%   left_join(regions,by=c('region_id'='id')) %>%   select(number,province,city,zip_code,area_code) dbDisconnect(con) df2015 <- read.xlsx('./Downloads/甘肃全国12320数据至-20200223.xlsx',sheet = 1) %>%   mutate(year=2015) df2016 <- read.xlsx('./Downloads/甘肃全国12320数据至-20200223.xlsx',sheet = 2) %>%   mutate(year=2016) df2017 <- read.xlsx('./Downloads/甘肃全国12320数据至-20200223.xlsx',sheet = 3) %>%   mutate(year=2017) df2018 <- read.xlsx('./Downloads/甘肃全国12320数据至-20200223.xlsx',sheet = 4) %>%   mutate(year=2018) df

hablar 类型转换

library(hablar)

# Convert several column types in one call, using hablar's num(), chr() and
# fct() selectors for gear, mpg and cyl respectively.
convert(mtcars, num(gear), chr(mpg), fct(cyl))

# Small example frame containing duplicated rows and missing values.
df <- data.frame(
  a = c("A", NA, "B", "C", "C"),
  b = c(NA, 1, 1, 3, 3),
  c = c(7, 8, 2, 3, 3),
  stringsAsFactors = FALSE
)

# Returns duplicated rows
find_duplicates(df)
# Same call restricted to columns b through c
find_duplicates(df, b:c)
# find_na() applied to column b
find_na(df, b)

slider 行

library(slider) library(lubridate) library(tidyverse) x <- c(1, 2, 3, 4, 5) # .before: How many elements before the current one should be included in the window? # .after: How many elements after the current one should be included in the window? # .complete: Should .f only be evaluated when there is enough data to make a complete window?ff # .step: The number of elements to shift forward between calls to .f. slide_vec(x, mean, .before = 1) slide_vec(x, mean, .after = 1) slide_vec(x, sum, .before = 2) slide_vec(x, sum, .before = 2, .complete = T) index_vec <- as.Date("2019-08-29") + c(0, 1, 5, 6) wday_vec <- as.character(wday(index_vec, label = TRUE)) sales_vec <- c(2, 4, 3, 5) company <- tibble(sales = sales_vec,                   index = index_vec,                   wday = wday_vec) # Over columns: map(company, ~ .x) # Over rows: # slide(company, ~ .x) big_index_vec <- c(as.Date("2019-08-30") + 0:4,                

代理测试

http代理 curl -x 127.0.0.1:10809 https://google.com socks5代理 curl --socks5 127.0.0.1:10808 https://google.com/

pip upgrade all

# Option 1 (Python): collect the project name of every distribution in
# pkg_resources.working_set and upgrade them all with one pip command.
import pkg_resources
from subprocess import call
packages = [dist.project_name for dist in pkg_resources.working_set]
call("pip install --upgrade " + ' '.join(packages), shell=True)
-----
# Option 2 (shell): strip the "==version" suffix from each `pip freeze` line
# with awk and pass the bare package names to `pip install -U`.
pip install -U $(pip freeze | awk '{split($0, a, "=="); print a[1]}')
----
# Option 3 (shell): write `pip freeze` output to a file, then install from it
# with --upgrade.
# NOTE(review): `pip freeze` writes "name==version" pins, so this may
# reinstall the same versions instead of upgrading -- confirm.
pip freeze > pip_frozen.txt
pip install -r pip_frozen.txt --upgrade

Python pip配置国内源

1、 Linux平台安装方式: (1)创建 pip.conf 文件  首先运行以下命令 cd ~/.pip # 运行此命令切换目录  如果提示目录不存在,自行创建一个(如果目录存在,可跳过此步),如下: mkdir ~/.pip cd ~/.pip  在 .pip 目录下创建一个 pip.conf 文件,如下: touch pip.conf (2)编辑 pip.conf 文件  首先打开文件,命令如下: sudo vi ~/.pip/pip.conf  接着,写入以下内容: [global] index-url = https://pypi.tuna.tsinghua.edu.cn/simple [install] trusted-host = pypi.tuna.tsinghua.edu.cn # trusted-host 此参数是为了避免麻烦,否则使用的时候可能会提示不受信任;注意这里只写主机名,不带 https:// 前缀 然后保存退出即可。  2、Windows平台安装方式: (1)新建  pip  配置文件夹,直接在user用户目录中创建一个名为 pip  的文件夹( 即 %HOMEPATH%\pip ),如下图所示: (2)接着在 pip 文件夹中创建一个名为 pip 的文本文件(后缀名由" .txt "改为 " .ini "),格式如下所示:  文件内容如下: [global] index-url = https://pypi.tuna.tsinghua.edu.cn/simple [install] trusted-host = pypi.tuna.tsinghua.edu.cn # trusted-host 此参数是为了避免麻烦,否则使用的时候可能会提示不受信任;注意这里只写主机名,不带 https:// 前缀 修改完成后保存,启动 cmd ,使用 " pip install xxx "( xxx 为你要下载的包名),即可默认使用国内源下载。

across and case_when

library(tidyverse)

# across() applies one or more functions to a selection of columns.

# Row-wise mean over all columns whose name starts with "Sepal".
iris %>%
  as_tibble() %>%
  mutate(mean = rowMeans(across(starts_with("Sepal"))))

# Per-species mean of every "Sepal" column.
iris %>%
  group_by(Species) %>%
  summarise(across(starts_with("Sepal"), mean))

# Same, ignoring missing values. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
iris %>%
  group_by(Species) %>%
  summarise(across(starts_with("Sepal"), ~ mean(.x, na.rm = TRUE)))

# across() avoids the hassle of writing out one expression per column, as in
# this single-column version:
iris %>%
  group_by(Species) %>%
  summarise(mean = mean(Sepal.Length))

# case_when(): conditions are tried in order; TRUE is the fallback branch.
starwars %>%
  select(name:mass, gender, species) %>%
  mutate(
    type = case_when(
      height > 200 | mass > 200 ~ "large",
      species == "Droid"        ~ "robot",
      TRUE                      ~ "other"
    )
  )

pip不能安装、升级、卸载软件问题

Cannot uninstall ‘nibabel’. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall. 问题解析: 旧版本依赖多,不能干净地删除,此时应该忽略旧版本升级,即如下 解决办法:sudo pip install nibabel --ignore-installed nibabel 下载安装 https://www.lfd.uci.edu/~gohlke/pythonlibs/ conda update --all --force-reinstall conda update conda --force-reinstall Windows10中,用户目录下.condarc 配置为: channels: - defaults # Show channel URLs when displaying what is going to be downloaded and # in ‘conda list’. The default is False. show_channel_urls: True allow_other_channels: True proxy_servers: http: socks5://127.0.0.1:1080 https: socks5://127.0.0.1:1080 ssl_verify: False

ubuntu install qq

wget -O- https://deepin-wine.i-m.dev/setup.sh | sh sudo apt install deepin.com.qq.im sudo apt install deepin

R 重采样

library(hyfo) library(lubridate) # Daily to monthly 降采样 TS <- data.frame(Date = seq(ymd('1999-01-01'), length = 365, by = '1 day'),                  num=runif(365, 3, 10)) TS_new <- resample(TS, method = 'day2mon') # Monthly to daily 升采样 TS <- data.frame(Date = seq(ymd('1999-01-01'), length = 12, by = '1 month'),                  num=runif(12, 3, 10)) TS_new <- resample(TS, method = 'mon2day') library(dplyr) library(lubridate) set.seed(2017) options(digits=4) expenses <- tibble(   date=seq(ymd("2019-01-01"), ymd("2020-12-31"), by=1),   amount=rgamma(length(date), shape = 2, scale = 20)) expenses %>% group_by(month=floor_date(date, "month")) %>%   summarize(amount=sum(amount)) expenses %>% group_by(month=floor_date(date, "3month")) %>%   summarize(amount=sum(amount)) expenses %>% group_by(month=floor_date(date, "year")) %>%   summarize(amount=sum(amount)

R#将1列拆分成多列

library(tidyverse)
library(stringr)

df <- read.csv('~/PycharmProjects/datascience/data/911.csv')

# Split one column into several: cut `title` at ": " into columns a and b,
# then tabulate the first part.
df3 <- df %>% separate(title, c("a", "b"), sep = ": ")
table(df3$a)

# Alternative: split with str_split() and bind the pieces row-wise.
df <- read.csv('~/PycharmProjects/datascience/data/911.csv', header = TRUE)
# as.data.frame() names the unnamed matrix columns V1, V2, ... before the
# conversion to a tibble; as_tibble() replaces the deprecated as.tibble().
df2 <- as_tibble(as.data.frame(do.call(rbind, str_split(df$title, ": "))))
table(df2$V1)

python on Rstudio.py

# R: set the RETICULATE_PYTHON environment variable to the interpreter path.
Sys.setenv(RETICULATE_PYTHON = "/usr/bin/python3.6")
# reticulate::py_config()

# Python (run in a Python chunk or REPL, not in R). The HTTP library module is
# named `requests`, as used by the crawler snippet earlier in this file.
import requests
import matplotlib
import numpy as np
import pandas as pd

RMySQL

图片
library(RMySQL) con <-  dbConnect(RMySQL::MySQL(),                   username = "root",                   password = "123456",                   host = "127.0.0.1",                   port = 3306,                   dbname = "python" ) con <- DBI::dbConnect(odbc::odbc(),                       driver = "/opt/rstudio-drivers/mysql/bin/lib/libmysqlodbc_sb64.so",                       database = "python",                       UID    = "root",                       PWD    = "123456",                       server = "172.17.0.5",                       port = 3306) # con <- dbConnect(odbc::odbc(), "oracledb", UID="samples", #                  PWD= rstudioapi::askForPassword("Samples User Password")) library(DBI) #"MySQL"为/etc/odbc.ini配置的信息 # [MySQL] # Driver=/opt/rstudio-drivers/mysql/bin/lib/libmysqlodbc_sb64.so # DSN=MySQL # Se