# RNASeq = GA from bcgsc
rm(list=ls()) 
norm=TRUE
norm=FALSE
setwd("/data/laml/bcgsc.ca_LAML.IlluminaGA_RNASeq.Level_3.1.7.0")
(s=dir(pattern="*gene.quantification.txt"))
if (norm) out="RPKM" else out="raw_counts" # RPKMreads per kilobase per million
for (i in 1:length(s)) {
  d=read.delim(s[i])
  if (i==1)  D=d[,out] else D=cbind(D,d[,out])
  print(i)
}

cnms=colnames(D)=substr(s,start=9,stop=12) # pull IDs out of file names
# note that cnms order must be right since for loop went through file name vector
rownames(D) # currently no row names
d[,1]  # first column of any file has gene names, so take from last d of for loop
rownames(D)=d[,1]# factor is automatically converted strings since rownames wants strings
head(rownames(D)) #see that rownames are strings. But I often convert factors
rnms=rownames(D)=as.character(d[,1])#to strings explicitly to remove all doubts.
head(rnms) #? = gene not known. Many ?=> not unique rows left of | so keep full names
# d=round(log(D),1) 
d=D  # save as little d to store in *.RData because I tend to use d for dataframes
head(d,2) 
dim(d) # 179 sample, 20442 gene expression values
length(unique(rownames(d)))
L=strsplit(rownames(d),"|",fixed=T)
S=sapply(L,function(x) x[1])
length(unique(S)) # 19991 different gene symbols, so ~450 do not have names
(fname=paste0("RNASeqV1",ifelse(norm,"norm","raw")))
write.csv(d,paste0("/data/laml/level4/",fname,".csv"))
save(d,file=paste0("/data/laml/level4/",fname,".RData"))

# 
rm(list=ls()) 
library(Biobase)
load("/data/laml/level4/RNASeqV1norm.RData")
eset <- ExpressionSet(assayData=d)
ids=colnames(eset)
load("/data/laml/level4/pD.RData")
pData(eset)=pD[ids,]
save(eset,file="/data/laml/level4/RNASeqV1normEset.RData")

load("/data/laml/level4/RNASeqV1raw.RData"))
eset <- ExpressionSet(assayData=d)
ids=colnames(eset)
load("/data/laml/level4/pD.RData")
pData(eset)=pD[ids,]
save(eset,file="/data/laml/level4/RNASeqV1rawEset.RData")



# RNASeqV2 =  HiSeq from UNC
setwd("/data/laml/unc.edu_LAML.IlluminaHiSeq_RNASeqV2.Level_3.1.2.0")
if (norm) s=dir(pattern=paste0("*genes.normalized_results")) else
  s=dir(pattern=paste0("*genes.results"))
head(s) # note that HiSeq file names use uuids, not patient ids 
for (i in 1:length(s)) {
  d=read.delim(s[i])
  if (i==1)  D=d[,2] else D=cbind(D,d[,2])
  print(i)
}
# 173 HiSeq files

(uuids=substring(s,9,44))
sdrf="/data/laml/meta/unc.edu_LAML.IlluminaHiSeq_RNASeqV2.1.2.0.sdrf.txt"
tail(ids<-read.delim(sdrf,stringsAsFactors=F)[,1:2],7) #blocks of 6 
# ugly column names. Switch to clean ones
names(ids)<-c("uuid","barCode")
head(ids,7)
ids 
1038/6 # 173
# scroll up to see that the blocks of 6 hold throughout, but don't assume this;
# for come tcga cancers this is not the case, and one has to first sort
(ids=ids[order(ids$uuid),])  # on uuids or barcodes to get blocks of 6 everywhere
(ids=ids[order(ids$barCode),]) 
(ids=ids[6*(1:173),])#heximate to eliminate redundancy and set up for uuid row naming
ids$ptID<-substring(ids$barCode,9,12)
ids
rownames(ids)<-ids$uuid
ids=ids[,-1] # no longer a need for the uuid column, since it's in the row names
ids=ids[uuids,] #rearrange to match order in the for loop above
cnms=colnames(D)=ids$ptID
rnms=rownames(D)=as.character(d[,1])
head(D)
d=round(D) 
dim(d)
d=d[,order(cnms)]
head(d)
(fname=paste0("RNASeqV2",ifelse(norm,"norm","raw")))
write.csv(d,paste0("/data/laml/level4/",fname,".csv"))
save(d,file=paste0("/data/laml/level4/",fname,".RData"))
# e <- ExpressionSet(assayData=d)
# save(e,file="/data/laml/level4/RNASeqV2set.RData")
# head(exprs(e))

rm(list=ls()) 
library(Biobase)
load("/data/laml/level4/RNASeqV2norm.RData")
eset <- ExpressionSet(assayData=d)
ids=colnames(eset)
load("/data/laml/level4/pD.RData")
pData(eset)=pD[ids,]
save(eset,file="/data/laml/level4/RNASeqV2normEset.RData")

load("/data/laml/level4/RNASeqV2raw.RData")
eset <- ExpressionSet(assayData=d)
ids=colnames(eset)
load("/data/laml/level4/pD.RData")
pData(eset)=pD[ids,]
save(eset,file="/data/laml/level4/RNASeqV2rawEset.RData")