# RNASeq = GA from bcgsc rm(list=ls()) norm=TRUE norm=FALSE setwd("/data/laml/bcgsc.ca_LAML.IlluminaGA_RNASeq.Level_3.1.7.0") (s=dir(pattern="*gene.quantification.txt")) if (norm) out="RPKM" else out="raw_counts" # RPKMreads per kilobase per million for (i in 1:length(s)) { d=read.delim(s[i]) if (i==1) D=d[,out] else D=cbind(D,d[,out]) print(i) } cnms=colnames(D)=substr(s,start=9,stop=12) # pull IDs out of file names # note that cnms order must be right since for loop went through file name vector rownames(D) # currently no row names d[,1] # first column of any file has gene names, so take from last d of for loop rownames(D)=d[,1]# factor is automatically converted strings since rownames wants strings head(rownames(D)) #see that rownames are strings. But I often convert factors rnms=rownames(D)=as.character(d[,1])#to strings explicitly to remove all doubts. head(rnms) #? = gene not known. Many ?=> not unique rows left of | so keep full names # d=round(log(D),1) d=D # save as little d to store in *.RData because I tend to use d for dataframes head(d,2) dim(d) # 179 sample, 20442 gene expression values length(unique(rownames(d))) L=strsplit(rownames(d),"|",fixed=T) S=sapply(L,function(x) x[1]) length(unique(S)) # 19991 different gene symbols, so ~450 do not have names (fname=paste0("RNASeqV1",ifelse(norm,"norm","raw"))) write.csv(d,paste0("/data/laml/level4/",fname,".csv")) save(d,file=paste0("/data/laml/level4/",fname,".RData")) # rm(list=ls()) library(Biobase) load("/data/laml/level4/RNASeqV1norm.RData") eset <- ExpressionSet(assayData=d) ids=colnames(eset) load("/data/laml/level4/pD.RData") pData(eset)=pD[ids,] save(eset,file="/data/laml/level4/RNASeqV1normEset.RData") load("/data/laml/level4/RNASeqV1raw.RData")) eset <- ExpressionSet(assayData=d) ids=colnames(eset) load("/data/laml/level4/pD.RData") pData(eset)=pD[ids,] save(eset,file="/data/laml/level4/RNASeqV1rawEset.RData") # RNASeqV2 = HiSeq from UNC setwd("/data/laml/unc.edu_LAML.IlluminaHiSeq_RNASeqV2.Level_3.1.2.0") if (norm) s=dir(pattern=paste0("*genes.normalized_results")) else s=dir(pattern=paste0("*genes.results")) head(s) # note that HiSeq file names use uuids, not patient ids for (i in 1:length(s)) { d=read.delim(s[i]) if (i==1) D=d[,2] else D=cbind(D,d[,2]) print(i) } # 173 HiSeq files (uuids=substring(s,9,44)) sdrf="/data/laml/meta/unc.edu_LAML.IlluminaHiSeq_RNASeqV2.1.2.0.sdrf.txt" tail(ids<-read.delim(sdrf,stringsAsFactors=F)[,1:2],7) #blocks of 6 # ugly column names. Switch to clean ones names(ids)<-c("uuid","barCode") head(ids,7) ids 1038/6 # 173 # scroll up to see that the blocks of 6 hold throughout, but don't assume this; # for come tcga cancers this is not the case, and one has to first sort (ids=ids[order(ids$uuid),]) # on uuids or barcodes to get blocks of 6 everywhere (ids=ids[order(ids$barCode),]) (ids=ids[6*(1:173),])#heximate to eliminate redundancy and set up for uuid row naming ids$ptID<-substring(ids$barCode,9,12) ids rownames(ids)<-ids$uuid ids=ids[,-1] # no longer a need for the uuid column, since it's in the row names ids=ids[uuids,] #rearrange to match order in the for loop above cnms=colnames(D)=ids$ptID rnms=rownames(D)=as.character(d[,1]) head(D) d=round(D) dim(d) d=d[,order(cnms)] head(d) (fname=paste0("RNASeqV2",ifelse(norm,"norm","raw"))) write.csv(d,paste0("/data/laml/level4/",fname,".csv")) save(d,file=paste0("/data/laml/level4/",fname,".RData")) # e <- ExpressionSet(assayData=d) # save(e,file="/data/laml/level4/RNASeqV2set.RData") # head(exprs(e)) rm(list=ls()) library(Biobase) load("/data/laml/level4/RNASeqV2norm.RData") eset <- ExpressionSet(assayData=d) ids=colnames(eset) load("/data/laml/level4/pD.RData") pData(eset)=pD[ids,] save(eset,file="/data/laml/level4/RNASeqV2normEset.RData") load("/data/laml/level4/RNASeqV2raw.RData") eset <- ExpressionSet(assayData=d) ids=colnames(eset) load("/data/laml/level4/pD.RData") pData(eset)=pD[ids,] save(eset,file="/data/laml/level4/RNASeqV2rawEset.RData")