diff --git a/mixOmics-ExamapleCode.R b/mixOmics-ExamapleCode.R new file mode 100644 index 0000000000000000000000000000000000000000..01102f2207388e29f420b6c08388e154e356b523 --- /dev/null +++ b/mixOmics-ExamapleCode.R @@ -0,0 +1,210 @@ +# setting up the workspace +#pathtodata<-"/run/user/1000/gvfs/smb-share:server=147.251.32.123,share=bouchal_budinska/Porovnani proteomiky a transkriptomiky_COMT_TNBC/" +pathtodata<-"R:/Porovnani proteomiky a transkriptomiky_COMT_TNBC/" + +setwd(pathtodata) + +#loading the data +E<-read.csv("data/norm_counts.tsv", sep="\t", header=T) # DESeq2 normalized expression data +P<-read.csv("data/20210702_110638_COMT_MDAMB231CRISPR_4349_2101_ProteinLevelReport.csv", sep="\t", header=T, dec = ".") + +#reading protein annotation +protan<-read.csv("data/Protein_annotation_COMT_TNBC.csv", header=T,sep="\t") + +prot_names<-protan$Genes +names(prot_names)<-protan$ProteinGroups + + +library(reshape) + +P$ID<-paste(P$R.Condition,P$R.Replicate, sep="_") + +P_wide<-reshape(P[,-c(1,2,3)], direction="wide", idvar="PG.ProteinAccessions", timevar="ID", ) +colnames(P_wide)<-gsub("_","_rep",gsub("ctrl","CTRL",gsub(pattern = "PG.Quantity.", replacement = "",colnames(P_wide)))) +rownames(P_wide)<-P_wide[,1] +P_wide<-P_wide[,-1] + +head(P_wide) +match(colnames(P_wide),colnames(E)) + +## selecting common samples +## means selecting samples with all data types (practically in this case removing F11 and third replicates) + +sample_sel<-intersect(colnames(P_wide),colnames(E))[which(regexpr("F11",intersect(colnames(P_wide),colnames(E)))==-1)] +E_sel<-E[,sample_sel] +P_sel<-P_wide[,sample_sel] + +## preparing gene names for nicer visualization if necessary +gene_names<-E$gene_name +names(gene_names)<-E$gene_id + + +### starting MixOmics analysis!!! ### + +library(mixOmics) + +## preparing list of data matrices for mixOmics +diabloset<-list(P=t(P_sel), E=t(E_sel)) # rows are samples, hence we are transposing the matrices + + + +## filtering parameters - it is wise to filter out variables that are not the most informative. Say based on coefficient of variation. + +# let's calculate the coefficient of variation for each parameter +CV_P<-apply(diabloset$P,2, function(x) mean(x)/sd(x)) +CV_E<-apply(diabloset$E,2, function(x) mean(x)/sd(x)) + +plot(sort(CV_P)) +plot(sort(CV_E)) + +sel_CV_P<-which(CV_P>15) +sel_CV_E<-which(CV_E>10) + +length(sel_CV_P) +length(sel_CV_E) + +diabloset_filt<-list(P=t(P_sel)[,sel_CV_P], E=t(E_sel)[,sel_CV_E]) + +group<-c("CTRL","COMT_neg")[c(regexpr("CTRL",rownames(diabloset_filt$E))==-1)+1] +names(group)<-rownames(diabloset_filt$E) + + + +## performing block.splsda ## + +design <- matrix(0.1, ncol = length(diabloset_filt), nrow = length(diabloset_filt), + dimnames = list(names(diabloset_filt), names(diabloset_filt))) +diag(design) = 0 + +Y <- group +sgccda10 <- block.splsda(X = diabloset_filt, + Y = Y, + ncomp = 10, + design = design, + scheme = "centroid", scale = TRUE) + + +variance<-unlist(lapply(sgccda10$prop_expl_var,function(x) x[[1]])[-7]) + +pdf("mixOmics/variance_explained_vs_weight_comp1.pdf") +barplot(sgccda10$weights$comp1, variance[-3], xlab="variance explained by 1st component",ylab="weight in the model", names.arg = names(variance)[-3], las=2) +dev.off() + +pdf("mixOmics/cimDiablo_Heatmapa.pdf", width = 28) +cimDiablo(sgccda10, margins = c(20,10), size.legend = 0.5) +dev.off() + + +cols<-c("red","blue")[as.numeric(as.factor(group))] + +pdf("mixOmics/sgccda.pdf") +plot(sgccda10, col=c("red","blue")) +dev.off() + +pdf("mixOmics/sgccda_indiv.pdf") +plotIndiv(sgccda10, col=c("red","blue")) +dev.off() + +pdf("mixOmics/loadings.pdf", height = 42, width=14) +plotLoadings(sgccda10, contrib = "max") +dev.off() + +plot(sort(abs(sgccda10$loadings$P[,1]))) +plot(sort(abs(sgccda10$loadings$E[,1]))) + + + +#selecting top 50 loadings from each +cuttedloadings_top50<-Reduce(union,lapply(loadings(sgccda10), function(x) names(sort(abs(x[,1]), decreasing=TRUE)[1:50]))[-3]) +cuttedloadings_top50_list<-lapply(loadings(sgccda10), function(x) names(sort(abs(x[,1]), decreasing=TRUE)[1:50]))[-3] + +## reanalyzing block.splsda on filtered top 50 loadings just for circosPlot +diabloset_filt_top50<-list(P=diabloset_filt$P[,cuttedloadings_top50_list$P],E=diabloset_filt$E[,cuttedloadings_top50_list$E]) + +varnams<-cuttedloadings_top50_list +varnams$E<-gene_names[cuttedloadings_top50_list$E] + +sgccda_top50 <- block.splsda(X = diabloset_filt_top50, + Y = Y, + ncomp = 10, + design = design, + scheme = "centroid", scale = TRUE) + +pdf("mixOmics/circosPlot_top50_0.95.pdf") +circosPlot(sgccda_top50,comp = 1:2, cutoff = 0.95, var.names = varnams) +dev.off() + + +plot(diabloset_filt$P[,"Q99460"],diabloset_filt$E[,"ENSG00000170734"]) + +# heatmapa na top 50 +pdf("mixOmics/cimDiablo_top50.pdf") +cimDiablo(sgccda_top50) +dev.off() + + + +#### correlations #### +col2 <- colorRampPalette(c("#67001F", "#B2182B", "#D6604D", "#F4A582", + "#FDDBC7", "#FFFFFF", "#D1E5F0", "#92C5DE", + "#4393C3", "#2166AC", "#053061")) +col3 <- colorRampPalette(c("#67001F", "#B2182B", "#D6604D", "#F4A582", + "#FDDBC7", "light grey", "#D1E5F0", "#92C5DE", + "#4393C3", "#2166AC", "#053061")) + +library(psych) + +B<-data.frame(diabloset_filt$P,diabloset_filt$E) +corPW<-cor(B,method="pearson") +corPW.test<-corr.test(B,method="pearson") +corPW.test$p.adjusted<- corPW.test$p +diag(corPW.test$p.adjusted)<-NA +corPW.test$p.adjusted[upper.tri(corPW.test$p.adjusted)]<-p.adjust(corPW.test$p[upper.tri(corPW.test$p)], method="BH") +corPW.test$p.adjusted[lower.tri(corPW.test$p.adjusted)]<-p.adjust(corPW.test$p[lower.tri(corPW.test$p)], method="BH") + + +## rows genes, columns proteins ## +corPW_sel<-corPW[1:ncol(diabloset_filt$P),(ncol(diabloset_filt$P)+1):ncol(corPW)] +corPW.test_sel<-corPW.test[1:ncol(diabloset_filt$P),(ncol(diabloset_filt$P)+1):ncol(corPW)] + +## rows genes, columns proteins ## +corPW_sel<-corPW[1:ncol(diabloset_filt$P),(ncol(diabloset_filt$P)+1):ncol(corPW)] +corPW.test_sel<-corPW.test[1:ncol(diabloset_filt$P),(ncol(diabloset_filt$P)+1):ncol(corPW)] + +selC<-names(which(apply(corPW.test_sel,2,function(x) sum(any(x<0.05)))>0)) +selR<-names(which(apply(corPW.test_sel,1,function(x) sum(any(x<0.05)))>0)) + +corPW_sel_top50<-corPW_sel[cuttedloadings_top50_list$P,cuttedloadings_top50_list$E] +colnames(corPW_sel_top50)<-gene_names[colnames(corPW_sel_top50)] +rownames(corPW_sel_top50)<-prot_names[rownames(corPW_sel_top50)] + +hclust_proteins<-hclust(dist(corPW_sel_top50)) +hclust_transcripts<-hclust(dist(t(corPW_sel_top50))) + +pdf("mixOmics/correlations_pearson_top50.pdf") +corrplot::corrplot(corPW_sel_top50[hclust_proteins$labels[hclust_proteins$order],hclust_transcripts$labels[hclust_transcripts$order]], is.corr = FALSE, col = col2(100)[100:1]) +dev.off() + + + +# merging with DEG table +deg_ctrl_b1<-read.csv("DE_analysis/CTRL_vs_B1/all/DESeq2_CTRLvsB1.tsv.csv", header=T,sep = "\t", row.names = 1) + +deg_ctrl_b1[cuttedloadings_top50_list$E,] + +t.test(diabloset$E[,"ENSG00000124181"]~group[rownames(diabloset$E)]) +boxplot(diabloset$E[,"ENSG00000124181"]~group[rownames(diabloset$E)]) + +deg_ctrl_b1["ENSG00000124181",]$log2FoldChange + +t.test(diabloset$E[,"ENSG00000124181"]~group[rownames(diabloset$E)]) + +apply(log(diabloset$E[1:4,rownames(deg_ctrl_b1[c(14,15583),])]),2, function(x) t.test(x~group[rownames(diabloset$E)][1:4])) + +boxplot(log(diabloset$E[,"ENSG00000124181"])~group[rownames(diabloset$E)],ylim=c(0,10)) +boxplot(log(diabloset$E[,"ENSG00000196611"])~group[rownames(diabloset$E)],ylim=c(0,10)) +boxplot(log(diabloset$E[,"ENSG00000103888"])~group[rownames(diabloset$E)],ylim=c(0,10)) + +deg_ctrl_b1[c("ENSG00000124181","ENSG00000196611","ENSG00000103888"),] + +