#
# Sample Python script for the estimation of certain random variables 
# coming from drawing repeatedly from a set of words, where the words 
# are composed of independent letters.
#
# Author: David Moews, 19-I-2006.
#
# This program has been placed in the public domain.
#

import math

#
# Convolve two probability distributions.  A probability distribution
# is represented as a list [probability 1, multiplicity of probability 1,
#                           probability 2, multiplicity of probability 2,
#                           ...,
#                           probability n, multiplicity of probability n].
# The sum of the products of the probabilities and their corresponding
# multiplicities should be 1.
#
def convolve(a, b):
	"""convolve two prob distrs, each a list [prb reps prb reps...]"""
	c=[]
	for i in range(len(a)/2):
		for j in range(len(b)/2):
			c.append(a[2*i]*b[2*j])
			c.append(a[2*i+1]*b[2*j+1])
	return c

#
# Take the nth convolution power of the distribution a.
#
def convpower(a, n):
	"""nth convolution power of prob distr a"""
	c=[1,1]
	for i in range(n):
		c=convolve(a,c)
	return c

#
# Return the highest probability which appears in a.
#
def maxprob(a):
	"""max probability in a"""
	c=0.0
	for i in range(len(a)/2):
		if (a[2*i]>c):
			c=a[2*i]
	return c

#
# Return the total number of probabilities in a
# (i.e., the sum of multiplicities in a.)
#
def size(a):
	"""total number of probs in a"""
	c=0
	for i in range(len(a)/2):
		c=c+a[2*i+1]
	return c

def log1p(x):
	"""return log(1+x)"""
	u=1.0+x
	if (u==1.0):
		return x
	else:
		return math.log(u)*x/(u-1.0)

def expm1(x):
	"""return exp(x)-1"""
	if (math.fabs(x)>0.1):
		return math.exp(x)-1
	else:
		tot=x
		term=x*x/2.0
		i=3
		while 1:
			next=tot+term
			if (next==tot):
				return tot
			tot=next
			term=term*x/i
			i=i+1
			
		
def powfun(eps,t):
	"""return 1-(1+eps)**t"""
	return -expm1(t*log1p(eps))

#
# The main function.  It analyzes the effect of drawing t times,
# with replacement, from a.  For the random variables N (= the
# number of different items drawn) and P (= the total probability
# mass of different items drawn), it prints out the mean and
# standard deviation of each random variable, and Chebyshev limits for
# its deviation from the mean.
#
def analyze(a,t):
	"""analyze t draws from a"""
	num=0
	probmass=0
	varnum=0
	varprobmass=0
	for i in range(len(a)/2):
		p=a[2*i]
		n=a[2*i+1]
		pfvalue=powfun(-p,t)
		num=num+n*pfvalue
		probmass=probmass+n*p*pfvalue
	for i in range(len(a)/2):
		for j in range(len(a)/2):
			p=a[2*i]
			n=a[2*i+1]
			q=a[2*j]
			m=a[2*j+1]
			pfvalue=(1-p-q)**t*powfun(p*q/(1-p-q),t)
			varnum=varnum+n*m*pfvalue
			varprobmass=varprobmass+n*m*p*q*pfvalue
	for i in range(len(a)/2):
		p=a[2*i]
		n=a[2*i+1]
		pfvalue=(1-p)**t*powfun(-p/(1-p),t)
		varnum=varnum+n*pfvalue
		varprobmass=varprobmass+n*p*p*pfvalue
	print
	print "Total number of words in the distribution is", size(a)
	print "The highest probability of any word is", maxprob(a)
	print "Number of draws is", t
	print "Mean number of different words is", num
	stdevnum=math.sqrt(varnum)
	print "Standard deviation of number of different words is", stdevnum
	print "Mean probability mass covered is", probmass
	stdevprobmass=math.sqrt(varprobmass)
	print "Standard deviation of probability mass covered is", stdevprobmass
	print "There is a >=75% chance that the number of words is within", stdevnum*2, "of the mean"
	print "There is a >=93.75% chance that the number of words is within", stdevnum*4, "of the mean"
	print "There is a >=75% chance that the probability mass is within", stdevprobmass*2, "of the mean"
	print "There is a >=93.75% chance that the probability mass is within", stdevprobmass*4, "of the mean"

# 
# Run this function to check powfun()
#
def check_powfun():
	t1=[0.2,0.1,0.05,0.01,0.001,1e-4,1e-5,1e-6,1e-7,-1e-7,-1e-6,-1e-5,-1e-4,-0.001,-0.01,-0.05,-0.1,-0.2]
	t2=[1,5,10,20,50,100]
	for i in range(len(t1)):
		for j in range(len(t2)):
			print powfun(t1[i],t2[j]), 1-(1+t1[i])**t2[j], powfun(t1[i],t2[j])-(1-(1+t1[i])**t2[j])











# First example of the use of analyze()
c=[1,1]
# First letter: 13 letters with 1/32 chance, 5 with 1/16, 3 with 3/32
c=convolve(c, [0.03125,13,0.0625,5,0.09375,3])
# Second letter: 13 letters with 1/32 chance, 5 with 1/16, 3 with 3/32
c=convolve(c, [0.03125,13,0.0625,5,0.09375,3])
# Analyze 161 draws
analyze(c,161)

# Second example; same as the first, but using convpower()
analyze(convpower([0.03125,13,0.0625,5,0.09375,3],2),161)

# Third example of the use of analyze()
# 1024 draws from a uniform distribution on 1024 letters
analyze([1.0/1024.0,1024],1024)

# Fourth example:
# 3072 draws from a uniform distribution on 1024 letters
analyze([1.0/1024.0,1024],3072)
