#Read - or scan - in the data into a matrix (160 rows, 4 columns)
TMP.data<-matrix(scan("c:\\TEMP\\TMP_proteinlengthdata.txt"),ncol=4,byrow=T)

#Remove column 3 as it is redundant
TMP.data<-TMP.data[,c(1,2,4)]

#Count the number of proteins
#Should be 160
TMP.nproteins<-nrow(TMP.data)
TMP.nproteins

#Column 1 contains the protein name or ID
#Column 2 contains the protein amino acid sequences
#Column 3 contains the secondary structure information

#For each protein, compute the sequence length and store the lengths in
#the vector TMP.proteinlengths.

TMP.proteinlengths<-numeric(length=TMP.nproteins)
for(i in 1:TMP.nproteins){
	s1<-TMP.data[i,2]
	TMP.proteinlengths[i]<-nchar(s1)
}
print(TMP.proteinlengths)

#Draw a histogram
hist(TMP.proteinlengths)

############################################################################
#Now repeat for the randomly selected proteins
#The format of this file is a little stranger ... it needs some more pre-processing
#before the analysis can be performed

#Here are some string manipulation functions
#Highlight them all, the RUN.

strsplit <- function(x, split){
    if (length(x) > 1) {
        return(lapply(x, strsplit, split))
        }
    result <- character(0)
    if (nchar(x) == 0) return(result)
    posn <- regexpr(split, x)
    if (posn <= 0) return(x)
    c(result, substring(x, 1, posn - 1),
        Recall(substring(x, posn+1, nchar(x)), split))
}


strsub <- function(pattern, replacement, x){
	np <- nchar(pattern)
	nx <- nchar(x)
	posn <- regexpr(pattern, x)
	if (posn <= 0) return(x)
	result <- if (posn == 1) paste(replacement, substring(x, np + 1, nx), sep="")
	else paste(substring(x, 1, posn - 1), replacement, substring(x, posn + np, nx), sep="")
	result
}

gsub <- function(pattern, replacement, x){
    if (regexpr(pattern, x) <= 0) return(x)
    x <- strsub(pattern, replacement, x)
    Recall(pattern, replacement, x)
}

#Now start reading in the data

protein.data<-scan("c:\\TEMP\\proteinlengthdata.txt")

n<-length(protein.data)
index<-c(1:n)

start.lines<-index[substring(protein.data,1,1) == ">"]
end.lines<-c(start.lines,n+1)
end.lines<-end.lines[-1]-1

proteins.data<-matrix(" ",ncol=2,nrow=length(start.lines))
for(i in 1:length(start.lines)){
	
	v<-character()
	for(j in (start.lines[i]+1):end.lines[i]){
		tmp.v<-protein.data[j]
		v<-paste(v,tmp.v)
	}
	v<-gsub(" ","",v)
	proteins.data[i,1]<-gsub(">","",protein.data[start.lines[i]])
	proteins.data[i,2]<-v
}

#Now, for the proteins.data matrix
#Column 1 contains the protein name or ID
#Column 2 contains the protein amino acid sequences

protein.nproteins<-nrow(proteins.data)
print(protein.nproteins)

#For each protein, compute the sequence length and store the lengths in
#the vector TMP.proteinlengths.

protein.proteinlengths<-numeric(length=protein.nproteins)
for(i in 1:protein.nproteins){
	s1<-proteins.data[i,2]
	protein.proteinlengths[i]<-nchar(s1)
}
print(protein.proteinlengths)

#Draw a histogram

TMP.proteinlengths
mean(TMP.proteinlengths)
var(TMP.proteinlengths)
mean(protein.proteinlengths)
var(protein.proteinlengths)

#Some Plots

range(c(TMP.proteinlengths,protein.proteinlengths))
log(range(c(TMP.proteinlengths,protein.proteinlengths)))
par(mfrow=c(2,2))
br<-c(0:20)*100
lbr<-c(0:20)*0.5
hist(TMP.proteinlengths,breaks=br,xlab="Length")
axes()
title("Transmembrane Proteins")
hist(log(TMP.proteinlengths),breaks=lbr,xlab="log(Length)")
axes()
title("Transmembrane Proteins (log scale)")

hist(protein.proteinlengths,breaks=br,xlab="Length")
axes()
title("Randomly selected Proteins")
hist(log(protein.proteinlengths),breaks=lbr,xlab="log(Length)")
axes()
title("Randomly selected Proteins (log scale)")


#The t-test on the log scale data

y.tmp<-log(TMP.proteinlengths)
y.random<-log(protein.proteinlengths)

#With equal variances assumed
test.result<-t.test(x=y.tmp, y=y.random, alternative="two.sided", mu=0, paired=F,var.equal=T, conf.level=.95)
print(test.result)
print(test.result$p.value)

#With unequal variances assumed
test.result.unequal<-t.test(x=y.tmp, y=y.random, alternative="two.sided", mu=0, paired=F,var.equal=F, conf.level=.95)
print(test.result.unequal)
print(test.result.unequal$p.value)

#Non-parametric testing
#Using the Mann-Whitney-Wilcoxon for equal medians

wilcox.test.result<-wilcox.test(x=y.tmp, y=y.random, alternative="two.sided", mu=0, paired=F, exact=T, correct=T)
print(wilcox.test.result)
print(wilcox.test.result$p.value)

#Using the Kolmogorov-Smirnov tests for whole distributions
ks.test.result<-ks.gof(x=y.tmp, y=y.random, alternative="two.sided")
print(ks.test.result)
print(ks.test.result$p.value)

######################################################################

#Test for normality
#A QQ (quantile-quantile) plot

par(mfrow=c(1,2),pty="s")

qqnorm(y.tmp)
title("Transmembrane Proteins")
qqnorm(y.random)
title("Randomly selected Proteins")