#############################################################

#Don't forget to use the "help" command,
#for example, at the command line, type
#
#>help(rexp)
#
#to find out what the "rexp" function does
#

#############################################################
n <- 500
lambda <- 3
nits <- 5000
xmin <- rep(0,nits)
xmax <- rep(0,nits)
for(i in 1:nits){
	x <- rexp(n,lambda)
	xmin[i] <- min(x)
	xmax[i] <- max(x)
}
hist(xmin)
hist(xmax)
length(xmin[xmin < 0.0001])/nits
1-exp(-n*lambda*0.0001)
length(xmin[xmin > 0.0005])/nits
exp(-n*lambda*0.0005)
length(xmax[xmax <= 2])/nits
(1-exp(-lambda*2))^n
length(xmax[xmax > 3])/nits
1-(1-exp(-lambda*3))^n

############################################################### 

x <- c(0:5000)/500
cdf.x <- function(x,lambda){1-exp(-lambda*x)}
cdf.xmin <- function(x,lambda,n){1-(1-cdf.x(x,lambda))^n}
cdf.xmax <- function(x,lambda,n){(cdf.x(x,lambda))^n}

############################################################### 

n <- 500
theta <- 0.25
nits <- 5000
xmin <- rep(0,nits)
xmax <- rep(0,nits)
for(i in 1:nits){
	x <- rgeom(n,theta)
	xmin[i] <- min(x)
	xmax[i] <- max(x)
}
hist(xmin)
hist(xmax)

############################################################### 
n <- 500
nits <- 5000
umin <- rep(0,nits)
umax <- rep(0,nits)
for(i in 1:nits){
	u <- runif(n)
	y <- c(0,sort(u),1)
	udiff <- diff(y)
	umin[i] <- min(udiff)
	umax[i] <- max(udiff)
}
hist(umin)
hist(umax)

############################################################### 

#Read - or scan - in the data into a matrix (160 rows, 4 columns)
TMP.data<-matrix(scan("c:\\TEMP\\TMPproteinlengthdata.txt"),ncol=4,byrow=T)

#Remove column 3 as it is redundant
TMP.data<-TMP.data[,c(1,2,4)]

#Count the number of proteins
#Should be 160
TMP.nproteins<-nrow(TMP.data)
TMP.nproteins

#Column 1 contains the protein name or ID
#Column 2 contains the protein amino acid sequences
#Column 3 contains the secondary structure information

#For each protein, compute the sequence length and store the lengths in
#the vector TMP.proteinlengths.

TMP.proteinlengths<-numeric(length=TMP.nproteins)
for(i in 1:TMP.nproteins){
	s1<-TMP.data[i,2]
	TMP.proteinlengths[i]<-nchar(s1)
}
print(TMP.proteinlengths)

#Draw a histogram
hist(TMP.proteinlengths)

############################################################################
#Now repeat for the randomly selected proteins
#The format of this file is a little stranger ... it needs some more pre-processing
#before the analysis can be performed

#Here are some string manipulation functions
#Highlight them all, the RUN.

strsplit <- function(x, split){
    if (length(x) > 1) {
        return(lapply(x, strsplit, split))
        }
    result <- character(0)
    if (nchar(x) == 0) return(result)
    posn <- regexpr(split, x)
    if (posn <= 0) return(x)
    c(result, substring(x, 1, posn - 1),
        Recall(substring(x, posn+1, nchar(x)), split))
}


strsub <- function(pattern, replacement, x){
	np <- nchar(pattern)
	nx <- nchar(x)
	posn <- regexpr(pattern, x)
	if (posn <= 0) return(x)
	result <- if (posn == 1) paste(replacement, substring(x, np + 1, nx), sep="")
	else paste(substring(x, 1, posn - 1), replacement, substring(x, posn + np, nx), sep="")
	result
}

gsub <- function(pattern, replacement, x){
    if (regexpr(pattern, x) <= 0) return(x)
    x <- strsub(pattern, replacement, x)
    Recall(pattern, replacement, x)
}

#Now start reading in the data

protein.data<-scan("c:\\TEMP\\proteinlengthdata.txt")

n<-length(protein.data)
index<-c(1:n)

start.lines<-index[substring(protein.data,1,1) == ">"]
end.lines<-c(start.lines,n+1)
end.lines<-end.lines[-1]-1

proteins.data<-matrix(" ",ncol=2,nrow=length(start.lines))
for(i in 1:length(start.lines)){
	
	v<-character()
	for(j in (start.lines[i]+1):end.lines[i]){
		tmp.v<-protein.data[j]
		v<-paste(v,tmp.v)
	}
	v<-gsub(" ","",v)
	proteins.data[i,1]<-gsub(">","",protein.data[start.lines[i]])
	proteins.data[i,2]<-v
}

#Now, for the proteins.data matrix
#Column 1 contains the protein name or ID
#Column 2 contains the protein amino acid sequences

protein.nproteins<-nrow(proteins.data)
print(protein.nproteins)

#For each protein, compute the sequence length and store the lengths in
#the vector TMP.proteinlengths.

protein.proteinlengths<-numeric(length=protein.nproteins)
for(i in 1:protein.nproteins){
	s1<-proteins.data[i,2]
	protein.proteinlengths[i]<-nchar(s1)
}
print(protein.proteinlengths)

#Draw a histogram
hist(protein.proteinlengths)