博文

目前显示的是 2022的博文

接种率数据处理

import math import pandas as pd import numpy as np import janitor import oracledb df=( pd.read_excel( '/mnt/c/Users/xuefe/Downloads/gnldet_jzl.xlsx' , sheet_name = 'sheet1' , skiprows = 3 ). clean_names(). apply( lambda x: x.fillna( method = "ffill" ) if x.name in [ 'unnamed_0' , 'unnamed_1' , 'unnamed_2' ] else x, axis = 0 ). filter( regex = '^un| 率 ' ). query( "unnamed_2 not in [' 总人数 ',' 全程接种 ']" ). apply( lambda x: pd.to_numeric(x.str.replace( "%" , "" ), errors = 'coerce' ) if ' 率 ' in x.name else x, axis = 0 ) ) # 重命名前 4 列 df.rename( columns ={df.columns[ 0 ]: 'dq' , df.columns[ 1 ]: 'bm' , df.columns[ 2 ]: 'ym' , df.columns[ 3 ]: 'jc' }, inplace = True ) # 重命名后 18 列 for i in range ( 4 , 22 ): df.rename( columns ={df.columns[i]: f'age_ { i - 3 } ' }, inplace = True ) # 按条件分别进行填充 # 'f' indi

出生队列接种率处理

 library(tidyverse) library(lubridate) library(showtext) library(janitor) library(openxlsx) library(magrittr) library(hablar) library(readxl) library(ggcharts) library(purrr) showtext_auto(enable = TRUE) `%nin%` = Negate(`%in%`) df <- readxl::read_excel('/mnt/c/Users/xuefe/Downloads/gnldet_jzl.xlsx',sheet = 1,skip = 3) %>%    fill(c(`...1`,`...2`,`...3`),.direction='down') %>%    filter(`...3` %nin% c('总人数','全程接种')) %>%    select(starts_with('...')  | contains("接种率")) %>%    map_dfc(str_remove_all, pattern = "%") %>%    mutate_at(vars(contains("接种率")), as.numeric) %>%    select(-`接种率(%)...22`) # df %>%  #   write.xlsx('/mnt/c/Users/xuefe/Downloads/jzl.xlsx') names(df) <- c('dqmc','bm','ym','jc','age_1','age_2','age_3','age_4','age_5','age_6','age_7','age_8','age_9',              

RJDBC链接oracle数据库

# Setup:
#   1. Install the RJDBC package.
#   2. Download the Oracle JDBC driver (ojdbc8.jar) from the
#      "JDBC and UCP Downloads" page on oracle.com.
#   3. Make the jar readable/executable: chmod 755 ojdbc8.jar
# (The original listing interleaved these steps as bare prose with the code,
# which made the snippet non-runnable; they are comments now.)
library(RJDBC)
jdbcDriver <- JDBC("oracle.jdbc.OracleDriver", classPath = "/usr/bin/ojdbc8.jar")
con <- dbConnect(
  jdbcDriver,
  "jdbc:oracle:thin:@//192.168.30.48:1521/JZDB1",
  "username", "password"
)

R 地址拆分

library(tidyverse) library(lubridate) library(showtext) library(janitor) library(openxlsx) library(stringi) showtext_auto() #时间需要修改 yw_date <- ymd('2022-11-16') `%nin%` = Negate(`%in%`) place_cut <- function(data) {   m=stri_match_all_regex(data,'[中]{0,1}[国]{0,1}([\u4e00-\u9fa5]*?(?:省|自治区|市|新疆|广西|内蒙古|宁夏))([\u4e00-\u9fa5]*?(?:市|区|县|自治州|盟)){0,1}([\u4e00-\u9fa5]*?(?:市|区|县|旗)){0,1}([\u4e00-\u9fa5]*?(?:乡|镇|街道|苏木)){0,1}([\u4e00-\u9fa5]*?(?:\\S+)){0,1}')   sheng=m[[1]][,2]   shi=m[[1]][,3]   xian=m[[1]][,4]   dizhi=str_c(sheng,shi,xian,sep=',')   return(dizhi) } ka <- read.csv('/mnt/d/1116 24时/报告卡.csv',fileEncoding = 'GB18030') %>%    mutate(有效证件号=toupper(str_remove_all(有效证件号,"'")),          报告卡录入时间=ymd_hms(报告卡录入时间),          订正终审时间=case_when(str_detect(订正终审时间,"\\.") ~ "",                           TRUE  ~ as.character(订正终审时间)),          订正报告时间=case_when(str_detect(订正报告时间,"\\.") ~ "",    

pathlib 学习

  import os import pathlib from pathlib import Path import aspose.words as aw def convert_path ( win_path ) : tmp_path = win_path.replace ( ":" , "" ) tmp_path = tmp_path.replace ( " \\ " , "/" ) tmp_path = tmp_path.lower () tmp_path = "/mnt/" + tmp_path return ( tmp_path ) url = pathlib.Path ( convert_path ( r"C:\Users\xuefe\Downloads\Video" ) ) files = list ( url.glob ( '*.*' ) ) # pathlib.PosixPath 转 字符串 for file in files: file= str ( file ) doc = aw.Document ( file, aw.loading.LoadOptions ( "2022" ) ) doc.save ( file ) # file_dir = convert_path(r"C:\Users\xuefe\Downloads\Video") # files = os.listdir(file_dir) # # for file in files: # file_p=os.path.join(file_dir,file) # print(type(file_p)) # doc = aw.Document(file_p, aw.loading.LoadOptions("2022")) # doc.save(file_p) cwd = pathlib.Path.cwd () # 获取当前目录 home = Path.home () # 获

tinytex包使用

 # 下载宏包,先修改为国内镜像源,比如改为清华大学的镜像源 tinytex::tlmgr_repo(url = "http://mirrors.tuna.tsinghua.edu.cn/CTAN/") tinytex::install_tinytex(version = "latest") tinytex::tlmgr_install('elegantbook') tinytex::tlmgr_install('fandol') #默认安装路径 tinytex::tinytex_root()  # 查看轻量级已包含哪些宏包 tl_pkgs()  # 解析错误日志,看看是缺什么宏包造成的 parse_packages("test.log")

pandas 条件赋值

  df["WhereCol"] = np.where((df.Cat == "A") & (df.B > 10), 1, 0) conditions = [ (df.Cat == "A") & (df.B > 10), (df.Cat == "B") & (df.B > 10) ] values = [1, 2] df["SelectCol"] = np.select(conditions, values, default=0) xg.case_when ( xg.A3.str.slice ( 16 , 17 ) .astype ( int ) % 2 == 0 , ' 女 ' , ~xg.A3.str.slice ( 16 , 17 ) .astype ( int ) % 2 == 0 , ' 男 ' , ' 不详 ' , column_name = " 性别 " ) xg=xg.case_when ( xg.A3.str.contains ( '^62' ) , ' 甘肃 ' , ~xg.A3.str.contains ( '^62' ) , ' 外省 ' , ' 其他 ' , column_name = " 归属 " )

pandas 链式操作

import pandas as pd import numpy as np import janitor yilanbiao=pd.read_excel ( convert_path ( r"C:\Users\xuefe\Desktop\ 未上报一览表 ( 截止 18 日 8 时 ).xlsx" ) ) ( yilanbiao. assign ( 身份证号 = lambda x: x [ ' 身份证号 ' ] .str.strip ( "'|’| , |“|‘" ) .str.upper () , 首次阳性采样时间 = lambda x:x [ ' 首次阳性采样时间 ' ] .pipe ( pd.to_datetime, unit = 's' ) ) . add_column ( column_name = "id" , value =yilanbiao [ ' 姓名 ' ] +yilanbiao [ ' 身份证号 ' ] ) ) ( yilanbiao. assign ( 身份证号 = lambda x:x [ ' 身份证号 ' ] .str.strip ( "'|’| , |“|‘" ) .str.upper () ) . concatenate_columns ( column_names = [ ' 姓名 ' , ' 身份证号 ' ] , new_column_name = 'id' , sep = '_' ) . drop_duplicates ( [ 'id' ] ) . drop ( [ 'id' ] , axis = 1 ) . also ( lambda x:x.to_excel ( r'/mnt/c/users/xuefe/desktop/ 去重复后 .xlsx' , index = False ) ) ) test= ( yilanbiao.also ( lambda x:x [ ' 身份证号 '

python 批量移除密码

  import msoffcrypto import pathlib def unlock ( filename, passwd, output_folder ) : temp = open ( filename, 'rb' ) excel = msoffcrypto.OfficeFile ( temp ) excel.load_key ( passwd ) out_path = pathlib.Path ( output_folder ) if not out_path.exists () : out_path.mkdir () with open ( str ( out_path/filename.name ) , 'wb' ) as f: excel.decrypt ( f ) temp.close () def convert_path ( win_path ) : tmp_path = win_path.replace ( ":" , "" ) tmp_path = tmp_path.replace ( " \\ " , "/" ) tmp_path = tmp_path.lower () tmp_path = "/mnt/" + tmp_path return ( tmp_path ) if __name__ == '__main__' : key= "2022" url = pathlib.Path ( convert_path ( r"C:\Users\xuefe\Downloads\Video" ) ) files = list ( url.glob ( '*.*' ) ) for f in files: unlock ( f, key, r'/mnt/c/users/xuefe/downloads/' )

matplotlib中文绘图

1 、在 /usr/share/fonts/下载字体 2、 fc-list :lang=zh 查看字体 3、使用 import matplotlib import numpy as np import pandas as pd import matplotlib.pyplot as plt from matplotlib import font_manager font_manager.fontManager.addfont ( "/usr/share/fonts/SourceHanSansCN-Normal.otf" ) plt.rcParams [ 'font.sans-serif' ] = [ 'Source Han Sans CN' ] ### 解决中文乱码 plt.rcParams [ 'axes.unicode_minus' ] = False # fname 为 你下载的字体库路径,注意 SourceHanSansSC-Bold.otf 字体的路径 dataset = pd.read_csv ( 'data/Rabies.csv' , header = 0 , index_col = 0 ) dataset.index=pd.to_datetime ( dataset.index ) dataset [ ' 狂犬疫苗接种量 ' ] .plot () dataset.plot () plt.show ()   import matplotlib.pyplot as plt from matplotlib.font_manager import FontProperties from plotnine import ggplot, aes, geom_bar, coord_flip, theme, element_text font = FontProperties( fname = "/usr/share/fonts/SourceHanSansCN-Regular.otf" ) (     ggplot(top_vaccines.reset_index(), aes( x = 'ym_mc' , y = &

Rmarkdown控制文本输出的宽度

  使用LaTeX软件包列表,详细参考 https://bookdown.org/yihui/rmarkdown-cookbook/text-width.html --- output : pdf_document : pandoc_args : --listings includes : in_header : preamble.tex --- 我们设置 了列表包的选项 : preamble.tex \lstset { breaklines=true }

rJava安装

# System libraries needed to compile common R packages (curl, xml2, units,
# gdal, compression libs).
sudo apt install libcurl4-openssl-dev libxml2-dev libudunits2-dev libgdal-dev libbz2-dev libpcre3-dev liblzma-dev zlib1g-dev -y
# Java runtime + JDK, then re-register Java with R.
sudo apt install -y default-jre default-jdk
sudo R CMD javareconf
# The following are R commands — run them inside an R session, not the shell
# (the original listing interleaved them with the shell commands):
# install.packages("rJava")
# install.packages("xml2", dependencies=TRUE, INSTALL_opts = c('--no-lock'))

R 正则表达式

 # Read sjbd.csv as raw lines, patch leading quotes, strip ",\N" markers,
 # drop malformed lines, then split column X1 into fields A..O on commas.
 # NOTE(review): the trailing `jg <- read.table(...)` re-reads the raw file and
 # overwrites the cleaned pipeline result — confirm which assignment is intended.
 jg <- read_table('~/Documents/sjbd.csv',col_names = F) %>%    mutate(X1=str_replace(X1,"^\"","1")) %>%    mutate(X1=str_replace(X1,",\\\\N","")) %>%    filter(!str_detect(X1,"^\\,")) %>%    filter(!str_detect(X1,"^\\(")) %>%    separate(X1,c('A','B','C','D','E','F','G','H','I','M','K','N','O'),sep=",") jg <- read.table('~/Documents/sjbd.csv') 

解决 Windows11 下 wsl 2使用 matplotlib.pyplot 画图没反应的问题

图片
1. 获取 WSL2 主机地址并添加到 .bashrc 文件中(Dynamically export the DISPLAY environment variable in WSL2; add to ~/.bashrc, and source it): export DISPLAY=$(cat /etc/resolv.conf | grep nameserver | awk '{print $2}'):0 2. 下载并打开 VcXsrv on Windows. It's important to check the "Disable access control" option to allow connections from WSL2; 3. PyCharm 中设置(Setting the DISPLAY environment variable in PyCharm);

python链接oracle数据库

import pandas as pd
import numpy as np
import janitor
import oracledb

# Connection option 1: explicit parameters via ConnectParams.
params = oracledb.ConnectParams(host="192.168.30.48", port=1521, service_name="JZDB1")
con = oracledb.connect(user="ipvsdb", password="######", params=params)

# Connection option 2: EZConnect-style DSN — user/password@host:port/service_name.
# (The exported string put the port before the host and contained stray spaces
# inside the literal, which produced an invalid DSN.)
dsn = f'{"ipvsdb"}/{"#####"}@192.168.30.48:{1521}/JZDB1'
con = oracledb.connect(dsn)

# Pull the district-code / district-name lookup table.
sqldq = '''
select dzbm,dzmc from sys_xzqh_zzjg
'''
xzqh = pd.read_sql(sqldq, con)
con.close()

解决PyCharm中Python Console的中文乱码问题

  进入 Settings -> Build, Execution, Deployment -> Console -> Python Console,在 Environment variables 中添加 PYTHONIOENCODING=UTF-8;Settings -> Editor -> File Encodings 均选择 UTF-8

RNA-seq 流程

 #cuffdiff pipeline: FASTQ reads -> mapping (HISAT2, TopHat2) -> BAM -> cuffdiff -> DEGs (FPKM)
 #DESeq2 pipeline: FASTQ reads -> mapping (HISAT2, TopHat2) -> BAM -> gene raw read counts -> find DEGs in R (no FPKM)

RNA-seq脚本备份

 hisat2-build建立索引 hisat2-build ./data/Saccharomyces_cerevisiae.R64-1-1.dna_rm.toplevel.fa ./data/yeast_ref hisat2将read比对到参参考基因组由于sam格式的文本文件过大,一般将其转换为bam格式的文件 hisat2 -x ./data/yeast_ref -U ./data/SRR1916152.fastq | samtools view -bS -| samtools sort - -o ./data/EV_3.bam hisat2 -x ./data/yeast_ref -U ./data/SRR1916153.fastq | samtools view -bS -| samtools sort - -o ./data/EV_4.bam hisat2 -x ./data/yeast_ref -U ./data/SRR1916154.fastq | samtools view -bS -| samtools sort - -o ./data/DNMT3B_2.bam hisat2 -x ./data/yeast_ref -U ./data/SRR1916155.fastq | samtools view -bS -| samtools sort - -o ./data/DNMT3B_3.bam hisat2 -x ./data/yeast_ref -U ./data/SRR1916156.fastq | samtools view -bS -| samtools sort - -o ./data/DNMT3B_4.bam 建立bam文件的索引 samtools index ./data/EV_3.bam samtools index ./data/EV_4.bam samtools index ./data/DNMT3B_2.bam samtools index ./data/DNMT3B_3.bam samtools index ./data/DNMT3B_4.bam 安装HTSeq软件,计算每个基因中reads的数目 htseq-count -f bam -r pos ./data/EV_3.bam ./data/Saccharomyces_cer

计算fpkm

setwd("E:\\read_count") library(tidyverse) library(magrittr) library(hablar) library(DESeq2) library(GenomicFeatures)  ##计算fpkm #根据参考基因组获得每条reads的exon txdb <- makeTxDbFromGFF("E:\\read_count\\Saccharomyces_cerevisiae.R64-1-1.106.gtf",format="gtf") exons_gene <- exonsBy(txdb, by = "gene") exons_gene_lens <- lapply(exons_gene,function(x){sum(width(reduce(x)))}) n=t(as.data.frame(exons_gene_lens)) #传入Count矩阵和基因长度 FPKM <- function(data_count,gene_len) {   lens <- nrow(data_count)   colnames <- data_count %>% colnames()   for (i in 1:ncol(data_count)) {     mapped_reads <- sum(data_count[1:lens,i])#计算每个样本的mapped reads数,由于data_count文本最后几行是综述数据,需要舍去,因此选择58721行之前的基因数据。     FPKM <- data_count[1:lens,i]/(10^-9*mapped_reads*gene_len[1:lens,1])#计算FPKM值,10^-9是单位转换后的换算,不能放在分母外的原因是mapped_reads*n[1:58721,1]的乘积很大会报错,因此需要先缩小倍数再进行计算     data_count = data.frame(data_count[1:lens,],FPKM)#添加到data_count表格后面i   }   data_count %>%      dplyr

批量读取和写出文件

 #批量读取文件 files <- list.files("./Documents/60岁以上/",pattern = "xlsx",full.names = T) df = purrr::map_dfr(files, readxl::read_xlsx)  save(df,file = './Documents/df.RData') load("D:\\df.RData") #按某列数据进行拆分写为文件 df %>%    group_by(区县名称) %>%    group_walk(~ write_csv(.x, paste0(.y$区县名称, ".csv")))

安装及使用 FastQC

 1 下载FASTQC  2、 安装好Java,JDK 3、解压 FastQC,linux 运行fastqc,如win 运行run_fastqc.bat

SRA Toolkit 使用

  1 、解压,将路径添加环境变量。 2 、运行 vdb-config -i 进行环境配置,需要启用“远程访问“并设置缓存选项卡,设置 存储库目录需要设置为空文件夹 。 3 、运行 fastq-dump.exe SRR1916156.1

批量进行OneHot编码

 #批量进行OneHot编码 cate=data.select_dtypes(include="object").columns.values.tolist() dummies = pd.get_dummies(data[cate], drop_first=True,prefix=cate) data = pd.concat([data.drop(cate,axis=1), dummies],axis=1)

pandas 查询

 # Keep doses given on/before 2021-12-31 whose batch number (ym_ph) appears in
 # jg's 疫苗批号 column; "@" pulls local Python variables into .query().
 # Then drop duplicate (grda_code, jz_zc) rows.
 # NOTE(review): neither result is assigned back — pandas returns new frames,
 # so jl itself is unchanged; confirm whether `jl = ...` was intended.
 jl.query("jz_sj<=@pd.Timestamp('2021-12-31') & ym_ph in @jg.疫苗批号") jl.drop_duplicates(['grda_code', 'jz_zc'])

stringr 数据框转字符串

 # Read a column of batch numbers, dedupe, and collapse the data frame into a
 # single quoted string (presumably for pasting into a SQL IN (...) clause).
 # NOTE(review): str_c()/str_trim() are applied to a data.frame, which coerces
 # it via as.character() — verify the output is the intended 'a','b','c' form.
 library(tidyverse) library(hablar) library(janitor) options(scipen=200) tst <- read.csv("D:\\ph.csv") %>%    clean_names() %>%    distinct() %>%    str_c(sep="','") %>%      str_trim() %>%    str_replace_all("[\"]","'") %>%    str_remove_all("[\n]")  

rstatix

 library(rstatix)  library(ggpubr)  # For easy data-visualizatio library(gtsummary) library(tidyverse) iris %>%    group_by(Species) %>%    get_summary_stats(Sepal.Length, Sepal.Width, type = "common") iris %>% get_summary_stats(type = "common") iris %>%   group_by(Species) %>%    get_summary_stats(Sepal.Length, show=c('n'))    df <- ToothGrowth df$dose <- as.factor(df$dose) # 比较2个独立组别 df %>%    t_test(len ~ supp, paired = FALSE)  -> stat.test # 分组数据:在按照「dose」分组后比较 supp 水平: stat.test <- df %>%   group_by(dose) %>%   t_test(len ~ supp) %>%   adjust_pvalue() %>%   add_significance("p.adj") # 比较配对样本 stat.test <- df %>%    t_test(len ~ supp, paired = TRUE) stat.test <- df %>%  # 成对比较:如果分组变量包含多于2个分类,会自动执行成对比较 pairwise.test <- df %>% t_test(len ~ dose) # 基于参考组的成对比较 stat.test <- df %>% t_test(len ~ dose, ref.group = "0.5") # 基于总体水平的成对比较 # T-test stat.test <- df %>% t_test(le

生信分析

# Install the Bioconductor / GitHub packages used by the analyses.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
BiocManager::install(c("GSEABase", "GSVA", "clusterProfiler"), ask = FALSE, update = FALSE)
BiocManager::install(c("GEOquery", "limma", "impute"), ask = FALSE, update = FALSE)
BiocManager::install(c("org.Hs.eg.db", "hgu133plus2.db"), ask = FALSE, update = FALSE)
BiocManager::install("GDCRNATools")
# requireNamespace() is the documented way to test whether a package is
# installed; the original used require() (which attaches the package as a
# side effect) and the reassignable shorthand F instead of FALSE.
if (!requireNamespace("tinyarray", quietly = TRUE)) {
  devtools::install_github("xjsun1221/tinyarray", upgrade = FALSE)
}

holoviz可视化

# Two separate shell commands (flattened onto one line by the blog export):
#   conda update -n base conda            — update conda itself first
#   conda install -c pyviz holoviz geoviews
conda update -n base conda conda install -c pyviz holoviz geoviews

rstatix及gtsummary

 library(rstatix)  library(ggpubr)  # For easy data-visualizatio library(gtsummary) library(tidyverse) iris %>%    group_by(Species) %>%    get_summary_stats(Sepal.Length, Sepal.Width, type = "common") iris %>% get_summary_stats(type = "common") iris %>%   group_by(Species) %>%    get_summary_stats(Sepal.Length, show=c('n'))    df <- ToothGrowth df$dose <- as.factor(df$dose) # 比较2个独立组别 df %>%    t_test(len ~ supp, paired = FALSE)  -> stat.test # 分组数据:在按照「dose」分组后比较 supp 水平: stat.test <- df %>%   group_by(dose) %>%   t_test(len ~ supp) %>%   adjust_pvalue() %>%   add_significance("p.adj") # 比较配对样本 stat.test <- df %>%    t_test(len ~ supp, paired = TRUE) stat.test <- df %>%  # 成对比较:如果分组变量包含多于2个分类,会自动执行成对比较 pairwise.test <- df %>% t_test(len ~ dose) # 基于参考组的成对比较 stat.test <- df %>% t_test(len ~ dose, ref.group = "0.5") # 基于总体水平的成对比较 # T-test stat.test <- df %>% t_test(le