#Read - or scan - in the data into a matrix (160 rows, 4 columns) TMP.data<-matrix(scan("c:\\TEMP\\TMP_proteinlengthdata.txt"),ncol=4,byrow=T) #Remove column 3 as it is redundant TMP.data<-TMP.data[,c(1,2,4)] #Count the number of proteins #Should be 160 TMP.nproteins<-nrow(TMP.data) TMP.nproteins #Column 1 contains the protein name or ID #Column 2 contains the protein amino acid sequences #Column 3 contains the secondary structure information #For each protein, compute the sequence length and store the lengths in #the vector TMP.proteinlengths. TMP.proteinlengths<-numeric(length=TMP.nproteins) for(i in 1:TMP.nproteins){ s1<-TMP.data[i,2] TMP.proteinlengths[i]<-nchar(s1) } print(TMP.proteinlengths) #Draw a histogram hist(TMP.proteinlengths) ############################################################################ #Now repeat for the randomly selected proteins #The format of this file is a little stranger ... it needs some more pre-processing #before the analysis can be performed #Here are some string manipulation functions #Highlight them all, the RUN. strsplit <- function(x, split){ if (length(x) > 1) { return(lapply(x, strsplit, split)) } result <- character(0) if (nchar(x) == 0) return(result) posn <- regexpr(split, x) if (posn <= 0) return(x) c(result, substring(x, 1, posn - 1), Recall(substring(x, posn+1, nchar(x)), split)) } strsub <- function(pattern, replacement, x){ np <- nchar(pattern) nx <- nchar(x) posn <- regexpr(pattern, x) if (posn <= 0) return(x) result <- if (posn == 1) paste(replacement, substring(x, np + 1, nx), sep="") else paste(substring(x, 1, posn - 1), replacement, substring(x, posn + np, nx), sep="") result } gsub <- function(pattern, replacement, x){ if (regexpr(pattern, x) <= 0) return(x) x <- strsub(pattern, replacement, x) Recall(pattern, replacement, x) } #Now start reading in the data protein.data<-scan("c:\\TEMP\\proteinlengthdata.txt") n<-length(protein.data) index<-c(1:n) start.lines<-index[substring(protein.data,1,1) == ">"] end.lines<-c(start.lines,n+1) end.lines<-end.lines[-1]-1 proteins.data<-matrix(" ",ncol=2,nrow=length(start.lines)) for(i in 1:length(start.lines)){ v<-character() for(j in (start.lines[i]+1):end.lines[i]){ tmp.v<-protein.data[j] v<-paste(v,tmp.v) } v<-gsub(" ","",v) proteins.data[i,1]<-gsub(">","",protein.data[start.lines[i]]) proteins.data[i,2]<-v } #Now, for the proteins.data matrix #Column 1 contains the protein name or ID #Column 2 contains the protein amino acid sequences protein.nproteins<-nrow(proteins.data) print(protein.nproteins) #For each protein, compute the sequence length and store the lengths in #the vector TMP.proteinlengths. protein.proteinlengths<-numeric(length=protein.nproteins) for(i in 1:protein.nproteins){ s1<-proteins.data[i,2] protein.proteinlengths[i]<-nchar(s1) } print(protein.proteinlengths) #Draw a histogram TMP.proteinlengths mean(TMP.proteinlengths) var(TMP.proteinlengths) mean(protein.proteinlengths) var(protein.proteinlengths) #Some Plots range(c(TMP.proteinlengths,protein.proteinlengths)) log(range(c(TMP.proteinlengths,protein.proteinlengths))) par(mfrow=c(2,2)) br<-c(0:20)*100 lbr<-c(0:20)*0.5 hist(TMP.proteinlengths,breaks=br,xlab="Length") axes() title("Transmembrane Proteins") hist(log(TMP.proteinlengths),breaks=lbr,xlab="log(Length)") axes() title("Transmembrane Proteins (log scale)") hist(protein.proteinlengths,breaks=br,xlab="Length") axes() title("Randomly selected Proteins") hist(log(protein.proteinlengths),breaks=lbr,xlab="log(Length)") axes() title("Randomly selected Proteins (log scale)") #The t-test on the log scale data y.tmp<-log(TMP.proteinlengths) y.random<-log(protein.proteinlengths) #With equal variances assumed test.result<-t.test(x=y.tmp, y=y.random, alternative="two.sided", mu=0, paired=F,var.equal=T, conf.level=.95) print(test.result) print(test.result$p.value) #With unequal variances assumed test.result.unequal<-t.test(x=y.tmp, y=y.random, alternative="two.sided", mu=0, paired=F,var.equal=F, conf.level=.95) print(test.result.unequal) print(test.result.unequal$p.value) #Non-parametric testing #Using the Mann-Whitney-Wilcoxon for equal medians wilcox.test.result<-wilcox.test(x=y.tmp, y=y.random, alternative="two.sided", mu=0, paired=F, exact=T, correct=T) print(wilcox.test.result) print(wilcox.test.result$p.value) #Using the Kolmogorov-Smirnov tests for whole distributions ks.test.result<-ks.gof(x=y.tmp, y=y.random, alternative="two.sided") print(ks.test.result) print(ks.test.result$p.value) ###################################################################### #Test for normality #A QQ (quantile-quantile) plot par(mfrow=c(1,2),pty="s") qqnorm(y.tmp) title("Transmembrane Proteins") qqnorm(y.random) title("Randomly selected Proteins")