博文

目前显示的是 七月, 2022的博文

RNA-seq 流程

 #cuffdiff pipeline FASTQ-reads -> mapping(HISAT2,tophat2) ->BAM ->cuffdiff ->DEGs(FPKM) #DESeq2 pipepline FASTQ-reads -> mapping(HISAT2,tophat2) ->BAM ->Gene Raw Reads Count ->find DEGs in R (No FPKM)

RNA-seq脚本备份

 hisat2-build建立索引 hisat2-build ./data/Saccharomyces_cerevisiae.R64-1-1.dna_rm.toplevel.fa ./data/yeast_ref hisat2将read比对到参参考基因组由于sam格式的文本文件过大,一般将其转换为bam格式的文件 hisat2 -x ./data/yeast_ref -U ./data/SRR1916152.fastq | samtools view -bS -| samtools sort - -o ./data/EV_3.bam hisat2 -x ./data/yeast_ref -U ./data/SRR1916153.fastq | samtools view -bS -| samtools sort - -o ./data/EV_4.bam hisat2 -x ./data/yeast_ref -U ./data/SRR1916154.fastq | samtools view -bS -| samtools sort - -o ./data/DNMT3B_2.bam hisat2 -x ./data/yeast_ref -U ./data/SRR1916155.fastq | samtools view -bS -| samtools sort - -o ./data/DNMT3B_3.bam hisat2 -x ./data/yeast_ref -U ./data/SRR1916156.fastq | samtools view -bS -| samtools sort - -o ./data/DNMT3B_4.bam 建立bam文件的索引 samtools index ./data/EV_3.bam samtools index ./data/EV_4.bam samtools index ./data/DNMT3B_2.bam samtools index ./data/DNMT3B_3.bam samtools index ./data/DNMT3B_4.bam 安装HTSeq软件,计算每个基因中reads的数目 htseq-count -f bam -r pos ./data/EV_3.bam ./data/Saccharomyces_cer

计算fpkm

setwd("E:\\read_count") library(tidyverse) library(magrittr) library(hablar) library(DESeq2) library(GenomicFeatures)  ##计算fpkm #根据参考基因组获得每条reads的exon txdb <- makeTxDbFromGFF("E:\\read_count\\Saccharomyces_cerevisiae.R64-1-1.106.gtf",format="gtf") exons_gene <- exonsBy(txdb, by = "gene") exons_gene_lens <- lapply(exons_gene,function(x){sum(width(reduce(x)))}) n=t(as.data.frame(exons_gene_lens)) #传入Count矩阵和基因长度 FPKM <- function(data_count,gene_len) {   lens <- nrow(data_count)   colnames <- data_count %>% colnames()   for (i in 1:ncol(data_count)) {     mapped_reads <- sum(data_count[1:lens,i])#计算每个样本的mapped reads数,由于data_count文本最后几行是综述数据,需要舍去,因此选择58721行之前的基因数据。     FPKM <- data_count[1:lens,i]/(10^-9*mapped_reads*gene_len[1:lens,1])#计算FPKM值,10^-9是单位转换后的换算,不能放在分母外的原因是mapped_reads*n[1:58721,1]的乘积很大会报错,因此需要先缩小倍数再进行计算     data_count = data.frame(data_count[1:lens,],FPKM)#添加到data_count表格后面i   }   data_count %>%      dplyr

批量读取和写出文件

 #批量读取文件 files <- list.files("./Documents/60岁以上/",pattern = "xlsx",full.names = T) df = purrr::map_dfr(files, readxl::read_xlsx)  save(df,file = './Documents/df.RData') load("D:\\df.RData") #按某列数据进行拆分写为文件 df %>%    group_by(区县名称) %>%    group_walk(~ write_csv(.x, paste0(.y$区县名称, ".csv")))