load R packages
library(dplyr)
library(tibble)
library(data.table)
library(impute)
load phenotype
phen <- read.csv("../../Study/Phenotype/clinical__information.csv")
prof <- fread("../../Study/RawData/samples_expression.txt")
curation
choose index of the phenotype for survival analysis
phen.cln <- phen %>%
mutate(stage_event_tnm_categories = paste0(pathologic_T, pathologic_N, pathologic_M)) %>%
dplyr::select(bcr_patient_barcode, gender, vital_status, days_to_death,
days_to_last_followup, race, person_neoplasm_cancer_status,
age_at_initial_pathologic_diagnosis, laterality,
neoplasm_histologic_grade, pathologic_stage, stage_event_tnm_categories) %>%
distinct(bcr_patient_barcode, .keep_all = TRUE)
# dead_patient
dead_patient <- phen.cln %>%
dplyr::filter(vital_status == "Dead") %>%
dplyr::select(-days_to_last_followup) %>%
reshape::rename(c(bcr_patient_barcode = "Barcode",
gender = "Gender",
vital_status = "OS",
days_to_death = "OS.Time",
race = "Race",
person_neoplasm_cancer_status = "cancer_status",
age_at_initial_pathologic_diagnosis = "Age",
neoplasm_histologic_grade = "Grade",
pathologic_stage = "Stage",
stage_event_tnm_categories = "TNM")) %>%
mutate(OS = ifelse(OS == "Dead", 1, 0)) %>%
mutate(OS.Time = as.numeric(as.character(OS.Time))/365)
# alive_patient
alive_patient <- phen.cln %>%
dplyr::filter(vital_status == "Alive") %>%
dplyr::select(-days_to_death) %>%
reshape::rename(c(bcr_patient_barcode = "Barcode",
gender = "Gender",
vital_status = "OS",
days_to_last_followup = "OS.Time",
race = "Race",
person_neoplasm_cancer_status = "cancer_status",
age_at_initial_pathologic_diagnosis = "Age",
neoplasm_histologic_grade = "Grade",
pathologic_stage = "Stage",
stage_event_tnm_categories = "TNM")) %>%
mutate(OS = ifelse(OS == "Dead", 1, 0)) %>%
mutate(OS.Time = as.numeric(as.character(OS.Time))/365)
# combine data
survival_data <- rbind(dead_patient, alive_patient) %>%
mutate(Barcode = gsub("-", "_", Barcode))
filter expression matrix
-
First, the biological features were removed if having zero value in more than 20% of patients.
-
The samples were removed if missing across more than 20% features.
-
Then we used impute function from R impute package, to fill out the missing values.
-
Lastly, we removed input features with zero values across all samples.
get_profile <- function(profile = prof,
metadata = survival_data,
occurrence = 0.2,
present = 0.2){
# profile = prof
# metadata = survival_data
# occurrence = 0.2
# present = 0.2
colnames(profile) <- c(colnames(profile)[1:3], gsub("-", "_", colnames(profile)[4:ncol(profile)]))
filter_fill_fun <- function(tag){
# tag = data_type[1]
dat <- profile %>% filter(Platform == tag) %>%
dplyr::select(-c("Platform", "Description"))
# feature occurrence
feature_occ <- apply(dat[, -1], 1, function(x){sum(x[!is.na(x)] != 0)/length(x)}) %>%
data.frame() %>%
setNames("feature_occurrence") %>%
mutate(GeneSymbol = dat$GeneSymbol)
remain_feature <- feature_occ %>% filter(feature_occurrence > occurrence)
# sample occurrence
sample_occ <- apply(dat[, -1], 2, function(x){sum(x[!is.na(x)] != 0)/length(x)}) %>%
data.frame() %>%
setNames("sample_occurrence") %>%
rownames_to_column("sample")
remain_sample <- sample_occ %>% filter(sample_occurrence > present)
# remove duplicate features
dat.cln <- dat %>% filter(GeneSymbol%in%as.character(remain_feature$GeneSymbol)) %>%
dplyr::select(c("GeneSymbol", as.character(remain_sample$sample)))
dat.cln$median <- apply(dat.cln[, -1], 1, median)
dat.cln <- with(dat.cln, dat.cln[order(GeneSymbol, median, decreasing = T), ])
dat.symbol <- dat.cln[!duplicated(dat.cln$GeneSymbol), ] %>%
dplyr::select(-median) %>%
column_to_rownames("GeneSymbol")
# impute.knn
set.seed(12345)
saved.state <- .Random.seed
dat.imputed <- impute.knn(as.matrix(dat.symbol))
return(dat.imputed)
}
data_type <- unique(profile$Platform)
#print(data_type)
copyNumber <- filter_fill_fun(data_type[1])
geneExp <- filter_fill_fun(data_type[2])
methylation <- filter_fill_fun(data_type[3])
protein_RPPA <- filter_fill_fun(data_type[4])
copyNumber_prf <- copyNumber$data
geneExp_prf <- geneExp$data
methylation_prf <- methylation$data
protein_RPPA_prf <- protein_RPPA$data
common_sampleid <- intersect(metadata$Barcode,
intersect(colnames(copyNumber_prf),
intersect(colnames(geneExp_prf),
intersect(colnames(methylation_prf), colnames(protein_RPPA_prf))
)
)
)
metadata.cln <- metadata %>% filter(Barcode %in% common_sampleid)
copyNumber_prf.cln <- copyNumber_prf[, colnames(copyNumber_prf)%in%common_sampleid]
geneExp_prf.cln <- geneExp_prf[, colnames(geneExp_prf)%in%common_sampleid]
methylation_prf.cln <- methylation_prf[, colnames(methylation_prf)%in%common_sampleid]
protein_RPPA_prf.cln <- protein_RPPA_prf[, colnames(protein_RPPA_prf)%in%common_sampleid]
res <- list(meta = metadata.cln,
CN = copyNumber_prf.cln,
GE = geneExp_prf.cln,
ML = methylation_prf.cln,
PR = protein_RPPA_prf.cln)
return(res)
}
data_trim_res <- get_profile(profile = prof,
metadata = survival_data)
output
dir_prf <- "../../Result/profile"
if(!dir.exists(dir_prf)){
dir.create(dir_prf)
}
# copyNumber
copyNumber.path <- paste(dir_prf, "copyNumber_filter.tsv", sep = "/")
write.table(data_trim_res$CN, file = copyNumber.path, quote = F, sep = "\t")
# geneExp
geneExp.path <- paste(dir_prf, "geneExp_filter.tsv", sep = "/")
write.table(data_trim_res$GE, file = geneExp.path, quote = F, sep = "\t")
# methylation
methylation.path <- paste(dir_prf, "methylation_filter.tsv", sep = "/")
write.table(data_trim_res$ML, file = methylation.path, quote = F, sep = "\t")
# protein_RPPA
protein_RPPA.path <- paste(dir_prf, "protein_RPPA_filter.tsv", sep = "/")
write.table(data_trim_res$PR, file = protein_RPPA.path, quote = F, sep = "\t")
dir_phe <- "../../Result/phenotype"
if(!dir.exists(dir_phe)){
dir.create(dir_phe)
}
# total
all_survival.path <- paste(dir_phe, "all_survival_data.tsv", sep = "/")
write.table(survival_data, file = all_survival.path, quote = F, sep = "\t", row.names = F)
# total
common_survival.path <- paste(dir_phe, "common_survival_data.tsv", sep = "/")
write.table(data_trim_res$meta, file = common_survival.path, quote = F, sep = "\t", row.names = F)
version
sessionInfo()
R version 3.6.3 (2020-02-29)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)
Matrix products: default
BLAS/LAPACK: /data/share/anaconda3/lib/libopenblasp-r0.3.12.so
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] impute_1.60.0 ggpubr_0.4.0 ggplot2_3.3.3 data.table_1.13.6 tibble_3.0.4 dplyr_1.0.2
loaded via a namespace (and not attached):
[1] Rcpp_1.0.5 lattice_0.20-41 tidyr_1.1.2 rprojroot_2.0.2 assertthat_0.2.1 digest_0.6.27 R6_2.5.0
[8] cellranger_1.1.0 plyr_1.8.6 backports_1.2.1 evaluate_0.14 pillar_1.4.7 rlang_0.4.10 curl_4.3
[15] readxl_1.3.1 rstudioapi_0.13 car_3.0-10 vegan_2.5-7 Matrix_1.3-0 rmarkdown_2.6 desc_1.2.0
[22] splines_3.6.3 stringr_1.4.0 foreign_0.8-76 munsell_0.5.0 broom_0.7.3 compiler_3.6.3 xfun_0.19
[29] pkgconfig_2.0.3 mgcv_1.8-33 htmltools_0.5.0 tidyselect_1.1.0 rio_0.5.16 reshape_0.8.8 fansi_0.4.1
[36] permute_0.9-5 crayon_1.3.4 withr_2.3.0 MASS_7.3-53 grid_3.6.3 nlme_3.1-150 gtable_0.3.0
[43] lifecycle_0.2.0 magrittr_2.0.1 scales_1.1.1 zip_2.1.1 cli_2.2.0 stringi_1.5.3 carData_3.0-4
[50] ggsignif_0.6.0 testthat_3.0.1 ellipsis_0.3.1 generics_0.1.0 vctrs_0.3.6 cowplot_1.1.1 openxlsx_4.2.3
[57] tools_3.6.3 forcats_0.5.0 glue_1.4.2 purrr_0.3.4 hms_0.5.3 abind_1.4-5 pkgload_1.1.0
[64] parallel_3.6.3 yaml_2.2.1 colorspace_2.0-0 cluster_2.1.0 rstatix_0.6.0 knitr_1.30 haven_2.3.1









网友评论