Libraries & functions


import numpy as np
import matplotlib.pyplot as plt
import time
import random as rdm
from scipy.integrate import simps

import math

def normpdf(x, mean, sd):
    var = float(sd)**2
    pi = 3.1415926
    denom = (2*pi*var)**.5
    num = math.exp(-(float(x)-float(mean))**2/(2*var))
    return num/denom


v_normpdf = np.vectorize(normpdf)

from matplotlib.ticker import ScalarFormatter, NullFormatter
#rcParams['font.family'] = 'Helvetica'
#plt.style.use('ggplot')
params = {'axes.labelsize': 28, 'xtick.labelsize': 16, 'ytick.labelsize': 16, 'text.usetex': True, 'lines.linewidth': 1,
          'axes.titlesize': 22, 'font.family': 'serif', 'axes.facecolor': (1,1,1)}
# Axes.set_facecolor(self, color)
plt.rcParams.update(params)
columnwidth = 240./72.27
textwidth = 504.0/72.27

# %matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

Mock data

Creating a mock data

mu_true = 100.
std_true = 35.
mock = np.random.normal(mu_true, std_true, size=10000)
plt.figure(facecolor=(1,1 , 1))
plt.hist(mock, range=(-50,250), bins=30)
plt.ylabel('counts')
plt.xlabel('x')
plt.show()

png

mu_true = np.mean(mock)
std_true = np.std(mock)
print(mu_true, std_true)
100.001201994 34.960934794

EMCEE functions

import emcee
import corner

No error

We will try to fit a gaussian to our generated mock data. In this first exercise, there is no error associated with the data



def probgauss(param, vector): #this is your likelihood function
    x = vector #note that vector could receive more then 1 observable.. 
    mu, std = param #these are the paramters you want to fit
    
    prob= v_normpdf(x, mu, std) 
    sum_prob = np.sum(np.log(prob)) #this is brute force. When fitting a gaussian, you can find an analytic function to get this.
    return(sum_prob) 

def lnprior(theta): #the range for the parameters
    mu, std= theta
    if 70 < mu <130 and  10 < std < 60 : #this is the most simple case. 
        return 0.0
    return -np.inf

def lnprob(theta, vec_vphi): #the prob for emcee
    lp = lnprior(theta)
    if not np.isfinite(lp):
        return -np.inf
    return lp+probgauss(theta, vec_vphi)

#number of free parameters and walkers
ndim, nwalkers = 2, 120

pos = [[85, 26] + 1e-3*np.random.randn(ndim) for i in range(nwalkers)] #initializing your walkers.

ncores= 4 #number of CPU cores you want to use.

sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, threads=ncores, args=[(mock)])

start_time = time.time() 
print('started sampler\n')

nsteps = 170
for i, result in enumerate(sampler.sample(pos, iterations=nsteps)):
    if (i+1) % 50 == 0:
        print()#need to use this if multithreading is used
        print('Total time: ',round((time.time() - start_time)/60,2), 
              "min. Completed: {0:5.1%}".format(float(i+1) / nsteps), end='\n')


sampler.pool.terminate() #it will close the  opened threads
print('Total time: ',round((time.time() - start_time)/60, 2), "minutes")#need to use this if multithreading is used

#if you want to save your chains for later analysis

# np.save(test_name, sampler.chain[:, :, :].reshape((-1, ndim))) 
# np.save(test_name + '_chain', sampler.chain)
# np.save(test_name+ '_likelihoods', sampler.lnprobability )

started sampler


Total time:  0.29 min. Completed: 29.4%

Total time:  0.57 min. Completed: 58.8%

Total time:  0.86 min. Completed: 88.2%
Total time:  0.97 minutes
samplerchain = sampler.chain
skip = 120 #we usually skip the initial steps. Its called the burn-out phase.
sampleserror = samplerchain[:, skip:, :].reshape((-1, ndim))
medians = np.percentile(sampleserror, 50, axis=0)
mu1 = medians[0]
std1 = medians[1]
p_median = [mu1, std1]
print(p_median)
[99.973957188632482, 34.985853160541069]
name_param = ['$\mu$', '$\sigma$']
for jj in range(ndim):
    plt.figure(facecolor=(1,1 , 1))
    for ii in range(nwalkers):
        p = samplerchain[ii, :, jj]
        plt.plot(p,  'k-', alpha=0.1,)
    plt.plot([0, nsteps], [p_median[jj], p_median[jj]], 'r-', label='median value')
    plt.xlabel('steps')
    plt.ylabel(name_param[jj])
    plt.legend(loc=2)
#     plt.savefig(test_name + '_' + name_param[jj]+'_walker.png', facecolor='w')
    plt.show()
    plt.close()

print('finished plot parameter evolution')
likelis = sampler.lnprobability
plt.figure(facecolor=(1,1 , 1))
for ii in range(nwalkers):
    plt.plot(likelis[:][ii],  'k-', alpha=0.1,)

plt.xlabel('steps')
plt.ylabel('$\Sigma \log(\mathcal{L})$')
# plt.savefig(test_name+'_likelihoods.png', facecolor='w')
plt.show()


quantiles = [0.1, 0.5, 0.9]



corner.corner(sampleserror, labels=name_param,
                    quantiles=quantiles, smooth=1.0,  show_titles=True,  title_kwargs={"fontsize": 11},
                   truths=[mu_true, std_true], facecolor='w')#,
#                    color='purple')#, smooth1d=0.5)


plt.show()

xx = np.linspace(0, 200, 300)
plt.figure(facecolor=(1,1 , 1))
for mum, stdm in sampleserror[np.random.randint(len(sampleserror), size=100)]:
    d =  v_normpdf(xx, mum, stdm)
    plt.plot(xx, (d), 'k-', alpha=0.1)

plt.hist(mock, histtype='step', normed=True, bins=50, label='generated')
plt.show()


png

png

finished plot parameter evolution

png

png

png

Mock data with error

Now, let’s include some error in our data.

mock_er = np.random.normal(20, 1, size=len(mock))

plt.figure(facecolor=(1,1 , 1))
plt.hist(mock_er, bins=100)
plt.show()

png

Lets assign to each mock data an ‘observed’ value based on its error

mockdata_obs = []
for v, er in zip(mock, mock_er):
    mockdata_obs.append(rdm.normalvariate(v, er))
mockdata_obs = np.asarray(mockdata_obs)
plt.figure(facecolor=(1,1 , 1))
vmin = -100
vmax = 280
b_size = 10
step = 5
vint, hist = my_hist(mock,vmin, vmax, b_size, step, normed=True)

plt.plot(vint, hist, label='True')
vint, hist = my_hist(mockdata_obs,vmin, vmax, b_size, step, normed=True)
plt.plot(vint, hist, label="Observed")
plt.legend()
plt.ylabel('prob')
plt.xlabel('x')
plt.show()

png

vround = np.vectorize(round)
print('True data - [mean, sigma]: ', vround([np.mean(mock), np.std(mock)],2))
print('Observed data - [mean, sigma]: ', vround([np.mean(mockdata_obs), np.std(mockdata_obs)],2))
True data - [mean, sigma]:  [ 100.     34.96]
Observed data - [mean, sigma]:  [ 99.88  40.68]

The observation errors enlarge the measured dispersion.

EMCEE in the ‘observed data’

sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, threads=ncores, args=[(mockdata_obs)])

start_time = time.time() 
print('started sampler\n')


nsteps = 170
for i, result in enumerate(sampler.sample(pos, iterations=nsteps)):
    if (i+1) % 50 == 0:
        print()#need to use this if multithreading is used
        print('Total time: ',round((time.time() - start_time)/60,2), 
              "min. Completed: {0:5.1%}".format(float(i+1) / nsteps), end='\n')


sampler.pool.terminate() #it will close the  opened threads
print('Total time: ',round((time.time() - start_time)/60, 2), "minutes")#need to use this if multithreading is used
# np.save(test_name, sampler.chain[:, :, :].reshape((-1, ndim)))
# np.save(test_name + '_chain', sampler.chain)
# np.save(test_name+ '_likelihoods', sampler.lnprobability )

started sampler


Total time:  0.3 min. Completed: 29.4%

Total time:  0.59 min. Completed: 58.8%

Total time:  0.89 min. Completed: 88.2%
Total time:  1.0 minutes
samplerchain = sampler.chain
name_param = ['$\mu$', '$\sigma$']
skip = 120
sampleserror = samplerchain[:, skip:, :].reshape((-1, ndim))
medians = np.percentile(sampleserror, 50, axis=0)
mu1 = medians[0]
std1 = medians[1]
p_median = [mu1, std1]
print(p_median)
[99.853592274585694, 40.676793741161177]
for jj in range(ndim):
    plt.figure(facecolor=(1,1 , 1))
    for ii in range(nwalkers):
        p = samplerchain[ii, :, jj]
        plt.plot(p,  'k-', alpha=0.1,)
    plt.plot([0, nsteps], [p_median[jj], p_median[jj]], 'r-', label='median value')
    plt.xlabel('steps')
    plt.ylabel(name_param[jj])
    plt.legend(loc=2)
#     plt.savefig(test_name + '_' + name_param[jj]+'_walker.png', facecolor='w')
    plt.show()

print('finished plot parameter evolution')
likelis = sampler.lnprobability
plt.figure(facecolor=(1,1 , 1))
for ii in range(nwalkers):
    plt.plot(likelis[:][ii],  'k-', alpha=0.1,)

plt.xlabel('steps')
plt.ylabel('$\Sigma \log(\mathcal{L})$')
# plt.savefig(test_name+'_likelihoods.png', facecolor='w')
plt.show()
plt.close()


quantiles = [0.1, 0.5, 0.9]



fig = corner.corner(sampleserror, labels=name_param,
                    quantiles=quantiles, smooth=1.0,  show_titles=True,  title_kwargs={"fontsize": 11},
                   truths=[mu_true, std_true], facecolor=(1,1 , 1))#,
#                    color='purple')#, smooth1d=0.5)


# plt.savefig(test_name + 'corner_after' + str(skip)+ '.png', facecolor='w')
plt.show()

xx = np.linspace(0, 200, 300)
plt.figure(facecolor=(1,1 , 1))
for mum, stdm in sampleserror[np.random.randint(len(sampleserror), size=100)]:
    d =  v_normpdf(xx, mum, stdm)
    plt.plot(xx, (d), 'k-', alpha=0.1)

vint, hist = my_hist(mock,vmin, vmax, b_size, step, normed=True)

plt.plot(vint, hist, label='generated')
vint, hist = my_hist(mockdata_obs,vmin, vmax, b_size, step, normed=True)
plt.plot(vint, hist, label='observed')
plt.legend()
# plt.savefig(test_name + 'fits_after' + str(skip)+ '.png', facecolor='w')
plt.show()

png

png

finished plot parameter evolution

png

png

png

As we can see, we fitted the observed data (orange curve). But we want to recover the true underlying distribution (blue curve)

EMCEE including error

Finding the TRUE underlying distribution

#CHECK PAGE 140  FROM THE BOOK: Statistics, Data Mining and Machine Learning in Astronomy
#also pages 203 
x = np.linspace(-300, 300, 500)
twopi = np.sqrt(2*np.pi)

#now the likelihood function includes the error if the measurement.

def probgauss_er(param, vector):
    x, x_er= vector
    mu, std = param
    std_er = np.sqrt(std**2 + x_er**2) #equation 4.29 or 5.63 from the book
    prob = -0.5*((x - mu)/std_er)**2+ np.log(1/(std_er))
    sum_prob = np.sum(prob)
    return(sum_prob) 

def lnprior(theta): #the range for the parameters
    mu, std= theta
    if 70 < mu <130 and  10 < std < 60 :
        return 0.0
    return -np.inf

def lnprob(theta, vec_vphi): #the prob for emcee
    lp = lnprior(theta)
    if not np.isfinite(lp):
        return -np.inf
    return lp+probgauss_er(theta, vec_vphi)

#number of free parameters and walkers
ndim, nwalkers = 2, 120

pos = [[85, 26] + 6*np.random.uniform(-1, 1)*np.random.randn(ndim) for i in range(nwalkers)]

ncores= 4

sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, threads=ncores, args=[(mockdata_obs, mock_er)])

start_time = time.time() 
print('started sampler\n')


nsteps = 150
for i, result in enumerate(sampler.sample(pos, iterations=nsteps)):
    if (i+1) % 50 == 0:
        print()#need to use this if multithreading is used
        print('Total time: ',round((time.time() - start_time)/60,2), 
              "min. Completed: {0:5.1%}".format(float(i+1) / nsteps), end='\n')


sampler.pool.terminate() #it will close the  opened threads
print('Total time: ',round((time.time() - start_time)/60, 2), "minutes")#need to use this if multithreading is used
# np.save(test_name, sampler.chain[:, :, :].reshape((-1, ndim)))
# np.save(test_name + '_chain', sampler.chain)
# np.save(test_name+ '_likelihoods', sampler.lnprobability )

started sampler


Total time:  0.02 min. Completed: 33.3%

Total time:  0.03 min. Completed: 66.7%

Total time:  0.04 min. Completed: 100.0%
Total time:  0.04 minutes
samplerchain = sampler.chain
name_param = ['$\mu$', '$\sigma$']
skip = 120
sampleserror = samplerchain[:, skip:, :].reshape((-1, ndim))
medians = np.percentile(sampleserror, 50, axis=0)
mu1 = medians[0]
std1 = medians[1]
p_median = [mu1, std1]
print(p_median)
[99.937328451806252, 35.3142947554622]
for jj in range(ndim):
    plt.figure(facecolor=(1,1 , 1))
    for ii in range(nwalkers):
        p = samplerchain[ii, :, jj]
        plt.plot(p,  'k-', alpha=0.1,)
    plt.plot([0, nsteps], [p_median[jj], p_median[jj]], 'r-', label='median value')
    plt.xlabel('steps')
    plt.ylabel(name_param[jj])
    plt.legend(loc=2)
#     plt.savefig(test_name + '_' + name_param[jj]+'_walker.png', facecolor='w')
    plt.show()

print('finished plot parameter evolution')
likelis = sampler.lnprobability
plt.figure(facecolor=(1,1 , 1))
for ii in range(nwalkers):
    plt.plot(likelis[:][ii],  'k-', alpha=0.1,)

plt.xlabel('steps')
plt.ylabel('$\Sigma \log(\mathcal{L})$')
# plt.savefig(test_name+'_likelihoods.png', facecolor='w')
plt.show()


quantiles = [0.05, 0.5, 0.95]



fig = corner.corner(sampleserror, labels=name_param,
                    quantiles=quantiles, smooth=1.0,  show_titles=True,  title_kwargs={"fontsize": 11},
                   truths=[mu_true, std_true])#,
#                    color='purple')#, smooth1d=0.5)


# plt.savefig(test_name + 'corner_after' + str(skip)+ '.png', facecolor='w')
plt.show()

xx = np.linspace(-50, 250, 300)
plt.figure(facecolor=(1,1 , 1))
for mum, stdm in sampleserror[np.random.randint(len(sampleserror), size=100)]:
    d =  v_normpdf(xx, mum, stdm)
    plt.plot(xx, np.log10(d), 'k:', alpha=0.05)

vint, hist = my_hist(mock,vmin, vmax, b_size, step, normed=True)

plt.plot(vint, np.log10(hist), linewidth=3, label='True')
vint, hist = my_hist(mockdata_obs,vmin, vmax, b_size, step, normed=True)
plt.plot(vint, np.log10(hist), label='observed')
plt.legend()
# plt.savefig(test_name + 'fits_after' + str(skip)+ '.png', facecolor='w')
plt.show()
plt.close()

png

png

finished plot parameter evolution

png

png

png