#
# Sample Python script for the estimation of certain random variables
# coming from drawing repeatedly from a set of words, where the words
# are composed of independent letters.
#
# Author: David Moews, 19-I-2006.
#
# This program has been placed in the public domain.
#
import math
#
# Convolve two probability distributions. A probability distribution
# is represented as a list [probability 1, multiplicity of probability 1,
# probability 2, multiplicity of probability 2,
# ...,
# probability n, multiplicity of probability n].
# The sum of the products of the probabilities and their corresponding
# multiplicities should be 1.
#
def convolve(a, b):
"""convolve two prob distrs, each a list [prb reps prb reps...]"""
c=[]
for i in range(len(a)/2):
for j in range(len(b)/2):
c.append(a[2*i]*b[2*j])
c.append(a[2*i+1]*b[2*j+1])
return c
#
# Take the nth convolution power of the distribution a.
#
def convpower(a, n):
"""nth convolution power of prob distr a"""
c=[1,1]
for i in range(n):
c=convolve(a,c)
return c
#
# Return the highest probability which appears in a.
#
def maxprob(a):
"""max probability in a"""
c=0.0
for i in range(len(a)/2):
if (a[2*i]>c):
c=a[2*i]
return c
#
# Return the total number of probabilities in a
# (i.e., the sum of multiplicities in a.)
#
def size(a):
"""total number of probs in a"""
c=0
for i in range(len(a)/2):
c=c+a[2*i+1]
return c
def log1p(x):
"""return log(1+x)"""
u=1.0+x
if (u==1.0):
return x
else:
return math.log(u)*x/(u-1.0)
def expm1(x):
"""return exp(x)-1"""
if (math.fabs(x)>0.1):
return math.exp(x)-1
else:
tot=x
term=x*x/2.0
i=3
while 1:
next=tot+term
if (next==tot):
return tot
tot=next
term=term*x/i
i=i+1
def powfun(eps,t):
"""return 1-(1+eps)**t"""
return -expm1(t*log1p(eps))
#
# The main function. It analyzes the effect of drawing t times,
# with replacement, from a. For the random variables N (= the
# number of different items drawn) and P (= the total probability
# mass of different items drawn), it prints out the mean and
# standard deviation of each random variable, and Chebyshev limits for
# its deviation from the mean.
#
def analyze(a,t):
"""analyze t draws from a"""
num=0
probmass=0
varnum=0
varprobmass=0
for i in range(len(a)/2):
p=a[2*i]
n=a[2*i+1]
pfvalue=powfun(-p,t)
num=num+n*pfvalue
probmass=probmass+n*p*pfvalue
for i in range(len(a)/2):
for j in range(len(a)/2):
p=a[2*i]
n=a[2*i+1]
q=a[2*j]
m=a[2*j+1]
pfvalue=(1-p-q)**t*powfun(p*q/(1-p-q),t)
varnum=varnum+n*m*pfvalue
varprobmass=varprobmass+n*m*p*q*pfvalue
for i in range(len(a)/2):
p=a[2*i]
n=a[2*i+1]
pfvalue=(1-p)**t*powfun(-p/(1-p),t)
varnum=varnum+n*pfvalue
varprobmass=varprobmass+n*p*p*pfvalue
print
print "Total number of words in the distribution is", size(a)
print "The highest probability of any word is", maxprob(a)
print "Number of draws is", t
print "Mean number of different words is", num
stdevnum=math.sqrt(varnum)
print "Standard deviation of number of different words is", stdevnum
print "Mean probability mass covered is", probmass
stdevprobmass=math.sqrt(varprobmass)
print "Standard deviation of probability mass covered is", stdevprobmass
print "There is a >=75% chance that the number of words is within", stdevnum*2, "of the mean"
print "There is a >=93.75% chance that the number of words is within", stdevnum*4, "of the mean"
print "There is a >=75% chance that the probability mass is within", stdevprobmass*2, "of the mean"
print "There is a >=93.75% chance that the probability mass is within", stdevprobmass*4, "of the mean"
#
# Run this function to check powfun()
#
def check_powfun():
t1=[0.2,0.1,0.05,0.01,0.001,1e-4,1e-5,1e-6,1e-7,-1e-7,-1e-6,-1e-5,-1e-4,-0.001,-0.01,-0.05,-0.1,-0.2]
t2=[1,5,10,20,50,100]
for i in range(len(t1)):
for j in range(len(t2)):
print powfun(t1[i],t2[j]), 1-(1+t1[i])**t2[j], powfun(t1[i],t2[j])-(1-(1+t1[i])**t2[j])
# First example of the use of analyze()
c=[1,1]
# First letter: 13 letters with 1/32 chance, 5 with 1/16, 3 with 3/32
c=convolve(c, [0.03125,13,0.0625,5,0.09375,3])
# Second letter: 13 letters with 1/32 chance, 5 with 1/16, 3 with 3/32
c=convolve(c, [0.03125,13,0.0625,5,0.09375,3])
# Analyze 161 draws
analyze(c,161)
# Second example; same as the first, but using convpower()
analyze(convpower([0.03125,13,0.0625,5,0.09375,3],2),161)
# Third example of the use of analyze()
# 1024 draws from a uniform distribution on 1024 letters
analyze([1.0/1024.0,1024],1024)
# Fourth example:
# 3072 draws from a uniform distribution on 1024 letters
analyze([1.0/1024.0,1024],3072)