xuefliang

博文

目前显示的是 2022的博文

接种率数据处理

十二月 31, 2022

import math import pandas as pd import numpy as np import janitor import oracledb df=( pd.read_excel( '/mnt/c/Users/xuefe/Downloads/gnldet_jzl.xlsx' , sheet_name = 'sheet1' , skiprows = 3 ). clean_names(). apply( lambda x: x.fillna( method = "ffill" ) if x.name in [ 'unnamed_0' , 'unnamed_1' , 'unnamed_2' ] else x, axis = 0 ). filter( regex = '^un| 率 ' ). query( "unnamed_2 not in [' 总人数 ',' 全程接种 ']" ). apply( lambda x: pd.to_numeric(x.str.replace( "%" , "" ), errors = 'coerce' ) if ' 率 ' in x.name else x, axis = 0 ) ) # 重命名前 4 列 df.rename( columns ={df.columns[ 0 ]: 'dq' , df.columns[ 1 ]: 'bm' , df.columns[ 2 ]: 'ym' , df.columns[ 3 ]: 'jc' }, inplace = True ) # 重命名后 18 列 for i in range ( 4 , 22 ): df.rename( columns ={df.columns[i]: f'age_ { i - 3 } ' }, inplace = True ) # 按条件分别进行填充 # 'f' indi...

出生队列接种率处理

十二月 27, 2022

library(tidyverse) library(lubridate) library(showtext) library(janitor) library(openxlsx) library(magrittr) library(hablar) library(readxl) library(ggcharts) library(purrr) showtext_auto(enable = TRUE) `%nin%` = Negate(`%in%`) df <- readxl::read_excel('/mnt/c/Users/xuefe/Downloads/gnldet_jzl.xlsx',sheet = 1,skip = 3) %>% fill(c(`...1`,`...2`,`...3`),.direction='down') %>% filter(`...3` %nin% c('总人数','全程接种')) %>% select(starts_with('...') | contains("接种率")) %>% map_dfc(str_remove_all, pattern = "%") %>% mutate_at(vars(contains("接种率")), as.numeric) %>% select(-`接种率(%)...22`) # df %>% # write.xlsx('/mnt/c/Users/xuefe/Downloads/jzl.xlsx') names(df) <- c('dqmc','bm','ym','jc','age_1','age_2','age_3','age_4','age_5','age_6','age_7','age_8','age_9', ...

RJDBC链接oracle数据库

十一月 19, 2022

1 安装RJDBC 2 安装 Oracle RJDBC驱动（ JDBC and UCP Downloads page (oracle.com) ） 3 下载并赋予执行权限 chmod 755 ojdbc8.jar library(RJDBC) jdbcDriver =JDBC("oracle.jdbc.OracleDriver",classPath="/usr/bin/ojdbc8.jar") con =dbConnect(jdbcDriver, "jdbc:oracle:thin:@//192.168.30.48:1521/JZDB1", "username", "password")

R 地址拆分

十一月 17, 2022

library(tidyverse) library(lubridate) library(showtext) library(janitor) library(openxlsx) library(stringi) showtext_auto() #时间需要修改 yw_date <- ymd('2022-11-16') `%nin%` = Negate(`%in%`) place_cut <- function(data) { m=stri_match_all_regex(data,'[中]{0,1}[国]{0,1}([\u4e00-\u9fa5]*?(?:省|自治区|市|新疆|广西|内蒙古|宁夏))([\u4e00-\u9fa5]*?(?:市|区|县|自治州|盟)){0,1}([\u4e00-\u9fa5]*?(?:市|区|县|旗)){0,1}([\u4e00-\u9fa5]*?(?:乡|镇|街道|苏木)){0,1}([\u4e00-\u9fa5]*?(?:\\S+)){0,1}') sheng=m[[1]][,2] shi=m[[1]][,3] xian=m[[1]][,4] dizhi=str_c(sheng,shi,xian,sep=',') return(dizhi) } ka <- read.csv('/mnt/d/1116 24时/报告卡.csv',fileEncoding = 'GB18030') %>% mutate(有效证件号=toupper(str_remove_all(有效证件号,"'")), 报告卡录入时间=ymd_hms(报告卡录入时间), 订正终审时间=case_when(str_detect(订正终审时间,"\\.") ~ "", TRUE ~ as.character(订正终审时...

pathlib 学习

十月 21, 2022

import os import pathlib from pathlib import Path import aspose.words as aw def convert_path ( win_path ) : tmp_path = win_path.replace ( ":" , "" ) tmp_path = tmp_path.replace ( " \\ " , "/" ) tmp_path = tmp_path.lower () tmp_path = "/mnt/" + tmp_path return ( tmp_path ) url = pathlib.Path ( convert_path ( r"C:\Users\xuefe\Downloads\Video" ) ) files = list ( url.glob ( '*.*' ) ) # pathlib.PosixPath 转字符串 for file in files: file= str ( file ) doc = aw.Document ( file, aw.loading.LoadOptions ( "2022" ) ) doc.save ( file ) # file_dir = convert_path(r"C:\Users\xuefe\Downloads\Video") # files = os.listdir(file_dir) # # for file in files: # file_p=os.path.join(file_dir,file) # print(type(file_p)) # doc = aw.Document(file_p, aw.loading.LoadOptions("2022")) # doc.save(file_p) cwd = pathlib.Path.cwd () # 获取当前目录 home = Path.home () # 获...

tinytex包使用

十月 21, 2022

# 下载宏包，先修改为国内镜像源，比如改为清华大学的镜像源 tinytex::tlmgr_repo(url = "http://mirrors.tuna.tsinghua.edu.cn/CTAN/") tinytex::install_tinytex(version = "latest") tinytex::tlmgr_install('elegantbook') tinytex::tlmgr_install('fandol') #默认安装路径 tinytex::tinytex_root() # 查看轻量级已包含哪些宏包 tl_pkgs() # 解析错误日志，看看是缺什么宏包造成的 parse_packages("test.log")

pandas 条件赋值

十月 19, 2022

df["WhereCol"] = np.where((df.Cat == "A") & (df.B > 10), 1, 0) conditions = [ (df.Cat == "A") & (df.B > 10), (df.Cat == "B") & (df.B > 10) ] values = [1, 2] df["SelectCol"] = np.select(conditions, values, default=0) xg.case_when ( xg.A3.str.slice ( 16 , 17 ) .astype ( int ) % 2 == 0 , ' 女 ' , ~xg.A3.str.slice ( 16 , 17 ) .astype ( int ) % 2 == 0 , ' 男 ' , ' 不详 ' , column_name = " 性别 " ) xg=xg.case_when ( xg.A3.str.contains ( '^62' ) , ' 甘肃 ' , ~xg.A3.str.contains ( '^62' ) , ' 外省 ' , ' 其他 ' , column_name = " 归属 " )

pandas 链式操作

十月 18, 2022

import pandas as pd import numpy as np import janitor yilanbiao=pd.read_excel ( convert_path ( r"C:\Users\xuefe\Desktop\ 未上报一览表 ( 截止 18 日 8 时 ).xlsx" ) ) ( yilanbiao. assign ( 身份证号 = lambda x: x [ ' 身份证号 ' ] .str.strip ( "'|’| ， |“|‘" ) .str.upper () , 首次阳性采样时间 = lambda x:x [ ' 首次阳性采样时间 ' ] .pipe ( pd.to_datetime, unit = 's' ) ) . add_column ( column_name = "id" , value =yilanbiao [ ' 姓名 ' ] +yilanbiao [ ' 身份证号 ' ] ) ) ( yilanbiao. assign ( 身份证号 = lambda x:x [ ' 身份证号 ' ] .str.strip ( "'|’| ， |“|‘" ) .str.upper () ) . concatenate_columns ( column_names = [ ' 姓名 ' , ' 身份证号 ' ] , new_column_name = 'id' , sep = '_' ) . drop_duplicates ( [ 'id' ] ) . drop ( [ 'id' ] , axis = 1 ) . also ( lambda x:x.to_excel ( r'/mnt/c/users/xuefe/desktop/ 去重复后 .xlsx' , index = False ) ) ) test= ( yilanbiao.also ( lambda x:x [ ' 身份证号 '...

python 批量移除密码

十月 17, 2022

import msoffcrypto import pathlib def unlock ( filename, passwd, output_folder ) : temp = open ( filename, 'rb' ) excel = msoffcrypto.OfficeFile ( temp ) excel.load_key ( passwd ) out_path = pathlib.Path ( output_folder ) if not out_path.exists () : out_path.mkdir () with open ( str ( out_path/filename.name ) , 'wb' ) as f: excel.decrypt ( f ) temp.close () def convert_path ( win_path ) : tmp_path = win_path.replace ( ":" , "" ) tmp_path = tmp_path.replace ( " \\ " , "/" ) tmp_path = tmp_path.lower () tmp_path = "/mnt/" + tmp_path return ( tmp_path ) if __name__ == '__main__' : key= "2022" url = pathlib.Path ( convert_path ( r"C:\Users\xuefe\Downloads\Video" ) ) files = list ( url.glob ( '*.*' ) ) for f in files: unlock ( f, key, r'/mnt/c/users/xuefe/downloads/' )

matplotlib中文绘图

九月 25, 2022

1 、在 /usr/share/fonts/下载字体 2、 fc-list :lang=zh 查看字体 3、使用 import matplotlib import numpy as np import pandas as pd import matplotlib.pyplot as plt from matplotlib import font_manager font_manager.fontManager.addfont ( "/usr/share/fonts/SourceHanSansCN-Normal.otf" ) plt.rcParams [ 'font.sans-serif' ] = [ 'Source Han Sans CN' ] ### 解决中文乱码 plt.rcParams [ 'axes.unicode_minus' ] = False # fname 为你下载的字体库路径，注意 SourceHanSansSC-Bold.otf 字体的路径 dataset = pd.read_csv ( 'data/Rabies.csv' , header = 0 , index_col = 0 ) dataset.index=pd.to_datetime ( dataset.index ) dataset [ ' 狂犬疫苗接种量 ' ] .plot () dataset.plot () plt.show () import matplotlib.pyplot as plt from matplotlib.font_manager import FontProperties from plotnine import ggplot, aes, geom_bar, coord_flip, theme, element_text font = FontProperties( fname = "/usr/share/fonts/SourceHanSansCN-Regular.otf" ) ( ggplot(top_vaccines.reset_index(), aes( x = 'ym_mc' , y = ...

Rmarkdown控制文本输出的宽度

九月 14, 2022

使用LaTeX软件包列表，详细参考 https://bookdown.org/yihui/rmarkdown-cookbook/text-width.html --- output : pdf_document : pandoc_args : --listings includes : in_header : preamble.tex --- 我们设置了列表包的选项： preamble.tex \lstset { breaklines=true }

rJava安装

九月 12, 2022

sudo apt install libcurl4-openssl-dev libxml2-dev libudunits2-dev libgdal-dev libbz2-dev libpcre3-dev liblzma-dev zlib1g-dev -y sudo apt install -y default-jre default-jdk sudo R CMD javareconf install.packages("rJava") install.packages("xml2", dependencies=TRUE, INSTALL_opts = c('--no-lock'))

R 正则表达式

八月 27, 2022

jg <- read_table('~/Documents/sjbd.csv',col_names = F) %>% mutate(X1=str_replace(X1,"^\"","1")) %>% mutate(X1=str_replace(X1,",\\\\N","")) %>% filter(!str_detect(X1,"^\\,")) %>% filter(!str_detect(X1,"^\\（")) %>% separate(X1,c('A','B','C','D','E','F','G','H','I','M','K','N','O'),sep=",") jg <- read.table('~/Documents/sjbd.csv')

解决 Windows11 下 wsl 2使用 matplotlib.pyplot 画图没反应的问题

八月 22, 2022

获取wls2主机地址并添加 .bashrc文件中 Dynamically export the DISPLAY environment variable in WSL2; # add to ~/.bashrc, and source it. export DISPLAY=$(cat /etc/resolv.conf | grep nameserver | awk '{print $2}'):0 2. 下载并打开 VcXsrv on Windows. It’s import to check the “Disable access contr ol” to allow connection from WSL2; 3. pycharm中设置 Setting the DISPLAY environment variable in PyCharm;

python链接oracle数据库

八月 21, 2022

import pandas as pd import numpy as np import janitor import oracledb # 链接方式一 params = oracledb.ConnectParams ( host = "192.168.30.48" , port = 1521 , service_name = "JZDB1" ) con = oracledb.connect ( user = "ipvsdb" , password = "######" , params =params ) # 链接方式二 dsn= f' { "ipvsdb" } / { "#####" } @ { 1521 } :192.168.30.48/JZDB1' con=oracledb.connect ( dsn ) sqldq = ''' select dzbm,dzmc from sys_xzqh_zzjg ''' xzqh = pd.read_sql ( sqldq, con ) con.close ()

解决PyCharm中Python Console的中文乱码问题

八月 20, 2022

进入Settings->Build, Execution, Deploymer->Console->Python Console Environment variables中添加PYTHONIOENCODING=UTF-8 Settings->Editor->File Encoding 均选择UTF-8

RNA-seq 流程

七月 21, 2022

#cuffdiff pipeline FASTQ-reads -> mapping(HISAT2,tophat2) ->BAM ->cuffdiff ->DEGs(FPKM) #DESeq2 pipepline FASTQ-reads -> mapping(HISAT2,tophat2) ->BAM ->Gene Raw Reads Count ->find DEGs in R (No FPKM)

RNA-seq脚本备份

七月 21, 2022

hisat2-build建立索引 hisat2-build ./data/Saccharomyces_cerevisiae.R64-1-1.dna_rm.toplevel.fa ./data/yeast_ref hisat2将read比对到参参考基因组由于sam格式的文本文件过大，一般将其转换为bam格式的文件 hisat2 -x ./data/yeast_ref -U ./data/SRR1916152.fastq | samtools view -bS -| samtools sort - -o ./data/EV_3.bam hisat2 -x ./data/yeast_ref -U ./data/SRR1916153.fastq | samtools view -bS -| samtools sort - -o ./data/EV_4.bam hisat2 -x ./data/yeast_ref -U ./data/SRR1916154.fastq | samtools view -bS -| samtools sort - -o ./data/DNMT3B_2.bam hisat2 -x ./data/yeast_ref -U ./data/SRR1916155.fastq | samtools view -bS -| samtools sort - -o ./data/DNMT3B_3.bam hisat2 -x ./data/yeast_ref -U ./data/SRR1916156.fastq | samtools view -bS -| samtools sort - -o ./data/DNMT3B_4.bam 建立bam文件的索引 samtools index ./data/EV_3.bam samtools index ./data/EV_4.bam samtools index ./data/DNMT3B_2.bam samtools index ./data/DNMT3B_3.bam samtools index ./data/DNMT3B_4.bam 安装HTSeq软件，计算每个基因中reads的数目 htseq-count -f bam -r pos ./data/EV_3.bam ./data/Saccharomyces...

计算fpkm

七月 21, 2022

setwd("E:\\read_count") library(tidyverse) library(magrittr) library(hablar) library(DESeq2) library(GenomicFeatures) ##计算fpkm #根据参考基因组获得每条reads的exon txdb <- makeTxDbFromGFF("E:\\read_count\\Saccharomyces_cerevisiae.R64-1-1.106.gtf",format="gtf") exons_gene <- exonsBy(txdb, by = "gene") exons_gene_lens <- lapply(exons_gene,function(x){sum(width(reduce(x)))}) n=t(as.data.frame(exons_gene_lens)) #传入Count矩阵和基因长度 FPKM <- function(data_count,gene_len) { lens <- nrow(data_count) colnames <- data_count %>% colnames() for (i in 1:ncol(data_count)) { mapped_reads <- sum(data_count[1:lens,i])#计算每个样本的mapped reads数,由于data_count文本最后几行是综述数据，需要舍去，因此选择58721行之前的基因数据。 FPKM <- data_count[1:lens,i]/(10^-9*mapped_reads*gene_len[1:lens,1])#计算FPKM值，10^-9是单位转换后的换算，不能放在分母外的原因是mapped_reads*n[1:58721,1]的乘积很大会报错，因此需要先缩小倍数再进行计算 data_count = data.frame(data_count[1:lens,],FPKM)#添加到data_count表格后面i } data_coun...

批量读取和写出文件

七月 05, 2022

#批量读取文件 files <- list.files("./Documents/60岁以上/",pattern = "xlsx",full.names = T) df = purrr::map_dfr(files, readxl::read_xlsx) save(df,file = './Documents/df.RData') load("D:\\df.RData") #按某列数据进行拆分写为文件 df %>% group_by(区县名称) %>% group_walk(~ write_csv(.x, paste0(.y$区县名称, ".csv")))

安裝及使用 FastQC

五月 07, 2022

1 下载FASTQC 2、安装好Java，JDK 3、解压 FastQC，linux 运行fastqc，如win 运行run_fastqc.bat

SRAtookit 使用

五月 06, 2022

1 、解压，将路径添加环境变量。 2 、运行 vdb-config -i 进行环境配置，需要启用“远程访问“并设置缓存选项卡，设置存储库目录需要设置为空文件夹。 3 、运行 fastq-dump.exe SRR1916156.1

批量进行OneHot编码

四月 23, 2022

#批量进行OneHot编码 cate=data.select_dtypes(include="object").columns.values.tolist() dummies = pd.get_dummies(data[cate], drop_first=True,prefix=cate) data = pd.concat([data.drop(cate,axis=1), dummies],axis=1)

pandas 查询

四月 08, 2022

jl.query("jz_sj<=@pd.Timestamp('2021-12-31') & ym_ph in @jg.疫苗批号") jl.drop_duplicates(['grda_code', 'jz_zc'])

stringr 数据框转字符串

二月 21, 2022

library(tidyverse) library(hablar) library(janitor) options(scipen=200) tst <- read.csv("D:\\ph.csv") %>% clean_names() %>% distinct() %>% str_c(sep="','") %>% str_trim() %>% str_replace_all("[\"]","'") %>% str_remove_all("[\n]")

rstatix

二月 14, 2022

library(rstatix) library(ggpubr) # For easy data-visualizatio library(gtsummary) library(tidyverse) iris %>% group_by(Species) %>% get_summary_stats(Sepal.Length, Sepal.Width, type = "common") iris %>% get_summary_stats(type = "common") iris %>% group_by(Species) %>% get_summary_stats(Sepal.Length, show=c('n')) df <- ToothGrowth df$dose <- as.factor(df$dose) # 比较2个独立组别 df %>% t_test(len ~ supp, paired = FALSE) -> stat.test # 分组数据：在按照「dose」分组后比较 supp 水平： stat.test <- df %>% group_by(dose) %>% t_test(len ~ supp) %>% adjust_pvalue() %>% add_significance("p.adj") # 比较配对样本 stat.test <- df %>% t_test(len ~ supp, paired = TRUE) stat.test <- df %>% # 成对比较：如果分组变量包含多于2个分类，会自动执行成对比较 pairwise.test <- df %>% t_test(len ~ dose) # 基于参考组的成对比较 stat.test <- df %>% t_test(len ~ dose, ref.group = "0.5...

生信分析

二月 11, 2022

安装包 if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install(c("GSEABase","GSVA","clusterProfiler" ),ask = F,update = F) BiocManager::install(c("GEOquery","limma","impute" ),ask = F,update = F) BiocManager::install(c("org.Hs.eg.db","hgu133plus2.db" ),ask = F,update = F) BiocManager::install("GDCRNATools") if (! require (tinyarray))devtools::install_github( "xjsun1221/tinyarray" ,upgrade = F )

holoviz可视化

一月 29, 2022

conda update -n base conda conda install -c pyviz holoviz geoviews

rstatix及gtsummary

一月 13, 2022

library(rstatix) library(ggpubr) # For easy data-visualizatio library(gtsummary) library(tidyverse) iris %>% group_by(Species) %>% get_summary_stats(Sepal.Length, Sepal.Width, type = "common") iris %>% get_summary_stats(type = "common") iris %>% group_by(Species) %>% get_summary_stats(Sepal.Length, show=c('n')) df <- ToothGrowth df$dose <- as.factor(df$dose) # 比较2个独立组别 df %>% t_test(len ~ supp, paired = FALSE) -> stat.test # 分组数据：在按照「dose」分组后比较 supp 水平： stat.test <- df %>% group_by(dose) %>% t_test(len ~ supp) %>% adjust_pvalue() %>% add_significance("p.adj") # 比较配对样本 stat.test <- df %>% t_test(len ~ supp, paired = TRUE) stat.test <- df %>% # 成对比较：如果分组变量包含多于2个分类，会自动执行成对比较 pairwise.test <- df %>% t_test(len ~ dose) # 基于参考组的成对比较 stat.test <- df %>% t_test(len ~ dose, ref.group = "0.5...