美文网首页
TCGA学习——从下载开始

TCGA学习——从下载开始

作者: monkey_study | 来源:发表于2022-05-03 10:30 被阅读0次

参考了生信技能树jimmy老师的TCGA下载代码,摸索了不少时间,先后尝试了for循环和apply系列函数(下文代码用的是lapply)。不得不说,lapply真好用,比for循环快了好多!感谢jimmy老师!!!

代码超级棒!
还稍微学了一点点正则表达式的应用,它好神奇!!!
我是用新版的TCGAbiolinks下载数据的!新版的参数设置与以前的教程相比有少许变化,磕磕绊绊总算下载下来了,可以进行后续处理啦。下载下来的表达矩阵包含了count、FPKM以及其他多种格式的数据,我取了count数据(即下文代码中以unstranded开头的列)!
废话不多说了,直接上代码!

# Step 1: query GDC for the TCGA-BRCA clinical supplement (BCR XML files) and
# download it with TCGAbiolinks.
# The workspace is deliberately not cleared with rm(list = ls()): nothing below
# depends on an empty workspace, and wiping it destroys the caller's objects.
library(TCGAbiolinks)

# List every GDC project id and keep the TCGA ones (exploratory; TCGA_ids is
# not used further in this step).
project_ids <- TCGAbiolinks::getGDCprojects()$project_id
TCGA_ids <- project_ids[grep("TCGA", project_ids, ignore.case = TRUE)]

project <- "TCGA-BRCA"
data.category <- "Clinical"

# Inspect file_count, data_categories and case_count for the project.
TCGAbiolinks::getProjectSummary(project)

# `legacy = FALSE` is no longer passed: it is the default in releases that
# still accept the argument, and it is rejected as an unused argument by
# releases that dropped it.
# NOTE(review): confirm against the installed TCGAbiolinks version.
query <- GDCquery(
  project = project,
  data.category = data.category,
  data.type = "Clinical Supplement",
  data.format = "BCR XML"
)

# Cache the query on disk; a later session can start from load() instead of
# querying GDC again.
save(query, file = "query.Rdata")
load("query.Rdata")

# Table of files matched by the query, and its dimensions.
query_results <- getResults(query)
dim(query_results)

# Download the matched files through the GDC API, 50 files per request.
directory <- "D:/Rstudio_data/TCGA/project/pyroptosis_lncRNA/GDCdata/Clinical-brca/"
GDCdownload(
  query,
  method = "api",
  files.per.chunk = 50,
  directory = directory
)

# Step 2: parse every downloaded clinical XML file and stack the records into
# one character matrix (one row per file, one column per XML field).
# Load the packages required to read XML files.
library("XML")
library("methods")
clinical_dir <- "D:/Rstudio_data/TCGA/project/pyroptosis_lncRNA/GDCdata/Clinical-brca/"

# `pattern` is a regular expression, not a shell glob: anchor on the literal
# ".xml" suffix instead of "*.xml$".
xml_files <- list.files(
  path = clinical_dir,
  pattern = "\\.xml$",
  recursive = TRUE
)
if (length(xml_files) == 0L) {
  # Without this guard do.call(cbind, list()) yields NULL and t(NULL) fails
  # with an unhelpful message.
  stop("No XML files found under ", clinical_dir, call. = FALSE)
}

cl <- lapply(xml_files, function(x) {
  result <- xmlParse(file = file.path(clinical_dir, x))
  rootnode <- xmlRoot(result)
  # Second child of the root node.
  # NOTE(review): presumably the patient record - confirm against the XML.
  xmldataframe <- xmlToDataFrame(rootnode[2])
  # Transpose so the fields run down the rows; the per-file results can then
  # be joined side by side with cbind().
  t(xmldataframe)
})

# cbind() gives fields x files; transpose back to files x fields.
cl_df <- t(do.call(cbind, cl))
save(cl_df, file = "BRCA_clinical_df.Rdata")



# Step 3: read every STAR gene-count TSV that was downloaded, bind the files
# column-wise and keep the three gene annotation columns plus the
# "unstranded" column of each file.
options(stringsAsFactors = FALSE)
library(dplyr)

# Exploration: read two individual files and check that their gene ids are in
# the same order, which is what makes a plain cbind() valid.
x <- read.table(
  file = "GDCdata/TCGA-BRCA/harmonized/Transcriptome_Profiling/Gene_Expression_Quantification/12c6b269-434e-4a18-aa93-844bc9c5eb4d/947967e5-9255-49e1-886b-3dd81909a9da.rna_seq.augmented_star_gene_counts.tsv",
  comment.char = "#",
  header = TRUE,
  sep = "\t",
  check.names = TRUE
)
# Drop the first four data rows.
# NOTE(review): presumably summary rows rather than genes - confirm in a file.
x <- x[-c(1:4), ]
x2 <- read.table(
  file = "GDCdata/TCGA-BRCA/harmonized/Transcriptome_Profiling/Gene_Expression_Quantification/130343f0-62f4-4400-8002-e596edb3c02a/fcc2a5c0-ec23-43b5-b616-9b03c8ff4129.rna_seq.augmented_star_gene_counts.tsv",
  comment.char = "#",
  header = TRUE,
  sep = "\t",
  check.names = TRUE
)
x2 <- x2[-c(1:4), ]
identical(x$gene_id, x2$gene_id)

# Batch-read all count files.
dir <- "./GDCdata/TCGA-BRCA/harmonized/Transcriptome_Profiling/Gene_Expression_Quantification/"
# `pattern` is a regular expression, not a glob: match names ending in
# "counts.tsv" with the dot escaped.
count_filepath <- list.files(
  path = dir,
  pattern = "counts\\.tsv$",
  recursive = TRUE
)
if (length(count_filepath) == 0L) {
  stop("No count files found under ", dir, call. = FALSE)
}

exp_count <- lapply(count_filepath, function(x) {
  read.table(
    file = file.path(dir, x),
    comment.char = "#",
    header = TRUE,
    sep = "\t",
    check.names = TRUE
  )[-c(1:4), ]
})

# cbind() aligns rows by position only, so verify the gene order for every
# file, not just the two inspected above.
same_gene_order <- vapply(
  exp_count,
  function(d) identical(d$gene_id, exp_count[[1]]$gene_id),
  logical(1)
)
if (!all(same_gene_order)) {
  stop("gene_id order differs between count files; cannot cbind them",
       call. = FALSE)
}

# NOTE: `exp` masks base::exp() in this session; the name is kept because
# later code in this script refers to it.
exp <- do.call(cbind, exp_count)

anno_cols <- c("gene_id", "gene_name", "gene_type")
# Annotation columns, taken from the first file.
gene_anno <- exp[, c(1, 2, 3)]
# %in% tests set membership. The previous `==` against a length-3 vector
# compared by recycling and was only right because each file contributes a
# multiple of three columns.
counts <- exp[, !colnames(exp) %in% anno_cols, drop = FALSE]
# Keep the columns whose name starts with "unstranded"; drop = FALSE keeps a
# data frame even when only one file was read.
counts <- counts[, grep("^unstranded", colnames(counts)), drop = FALSE]

ex <- cbind(gene_anno, counts)
save(ex, file = "Brca_count_noanno.Rdata")
load("Brca_count_noanno.Rdata")

# Step 4: rename the count columns of `ex` with the identifier recorded for
# each file in the GDC metadata JSON, then save the annotated matrix.
# Relies on `ex` and `count_filepath` created earlier in this script.
library(jsonlite)
jsonFile <- "GDCdata/metadata.cart.2022-05-03.json"
# Parsed into a data.frame (the original author noted 1226 rows).
metadata_json_File <- jsonlite::fromJSON(txt = jsonFile)

# First column of each file's associated_entities table. vapply() enforces
# exactly one string per file and fails loudly otherwise.
# NOTE(review): the original comment calls this a "file id"; confirm which
# identifier the first column actually holds.
ID <- vapply(
  metadata_json_File$associated_entities,
  function(x) x[, 1],
  character(1)
)
file2id <- data.frame(
  file_name = metadata_json_File$file_name,
  ID = ID
)
head(file2id$file_name)
head(count_filepath)

# File name without its directory part, in the order the files were read.
count_files2 <- basename(count_filepath)

# Every count file must be present in the metadata; otherwise match() returns
# NA and the corresponding column would silently get an NA name.
missing_meta <- setdiff(count_files2, file2id$file_name)
if (length(missing_meta) > 0L) {
  stop("Count files missing from metadata: ",
       paste(missing_meta, collapse = ", "), call. = FALSE)
}
# Reorder the metadata to follow the column order of the count matrix.
file2id <- file2id[match(count_files2, file2id$file_name), ]

# The former `colnames(exp) = file2id$ID` was removed: `exp` holds several
# columns per file, so one ID per file does not fit it, and `exp` is not used
# afterwards.
anno_cols <- c("gene_id", "gene_name", "gene_type")
# %in% tests set membership; `==` against a length-3 vector compared by
# recycling.
b <- ex[, !colnames(ex) %in% anno_cols, drop = FALSE]
stopifnot(ncol(b) == nrow(file2id))
colnames(b) <- file2id$ID
a <- ex[, c(1, 2, 3)]
ex <- cbind(a, b)
save(ex, file = "Brca_count_anno.Rdata")

# Reload the annotated file. The original reloaded "Brca_count_noanno.Rdata",
# which overwrote `ex` with the un-annotated matrix.
load("Brca_count_anno.Rdata")
if (interactive()) {
  View(ex[1:4, 1:4])
}

说在最后:这是我第一次真正下载数据,上手后发现还是有一定难度的,需要不断查找资料、尝试修改。有不足的地方,欢迎大家指正!
其中,有一点比较绕的地方(我自己脑抽了,来回倒腾),我先不改了,休息一下!

参考:数据框组成的列表用lapply循环 - 搜索 (bing.com)
TCGA的28篇教程-整理GDC下载的xml格式的临床资料 - 云+社区 - 腾讯云 (tencent.com)
https://www.jianshu.com/p/bf6d592fe8dc

相关文章

网友评论

      本文标题:TCGA学习——从下载开始

      本文链接:https://www.haomeiwen.com/subject/fumbyrtx.html