%matplotlib inline
import scipy.stats as stats
from IPython.core.pylabtools import figsize
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

figsize(12.5, 4)

jet = plt.cm.jet
fig = plt.figure()
x = y = np.linspace(0, 5, 100)
X, Y = np.meshgrid(x, y)

plt.subplot(121)
uni_x = stats.uniform.pdf(x, loc=0, scale=5)
uni_y = stats.uniform.pdf(y, loc=0, scale=5)
M = np.dot(uni_x[:, None], uni_y[None, :])
im = plt.imshow(M, interpolation='none', origin='lower',
                cmap=jet, vmax=1, vmin=-.15, extent=(0, 5, 0, 5))
plt.xlim(0, 5)
plt.ylim(0, 5)
plt.title("Landscape formed by Uniform priors.")

ax = fig.add_subplot(122, projection='3d')
ax.plot_surface(X, Y, M, cmap=plt.cm.jet, vmax=1, vmin=-.15)
ax.view_init(azim=390)
plt.title("Uniform prior landscape; alternate view")

figsize(12.5, 5)
fig = plt.figure()

plt.subplot(121)
exp_x = stats.expon.pdf(x, scale=3)
exp_y = stats.expon.pdf(y, scale=10)
M = np.dot(exp_x[:, None], exp_y[None, :])
CS = plt.contour(X, Y, M)
im = plt.imshow(M, interpolation='none', origin='lower',
                cmap=jet, extent=(0, 5, 0, 5))
# plt.xlabel("prior on $p_1$")
# plt.ylabel("prior on $p_2$")
plt.title("$Exp(3), Exp(10)$ prior landscape")

ax = fig.add_subplot(122, projection='3d')
ax.plot_surface(X, Y, M, cmap=jet)
ax.view_init(azim=390)
plt.title("$Exp(3), Exp(10)$ prior landscape; \nalternate view")

# create the observed data

# sample size of data we observe; try varying this (keep it less than 100 ;)
N = 1

# the true parameters, but of course we do not see these values...
lambda_1_true = 1
lambda_2_true = 3

# ...we see the data generated, dependent on the above two values.
data = np.concatenate([
    stats.poisson.rvs(lambda_1_true, size=(N, 1)),
    stats.poisson.rvs(lambda_2_true, size=(N, 1))
], axis=1)
print "observed (2-dimensional, sample size = %d):" % N, data

# plotting details.
x = y = np.linspace(.01, 5, 100)
likelihood_x = np.array([stats.poisson.pmf(data[:, 0], _x)
                         for _x in x]).prod(axis=1)
likelihood_y = np.array([stats.poisson.pmf(data[:, 1], _y)
                         for _y in y]).prod(axis=1)
L = np.dot(likelihood_x[:, None], likelihood_y[None, :])

figsize(12.5, 12)
# matplotlib heavy lifting below, beware!
plt.subplot(221)
uni_x = stats.uniform.pdf(x, loc=0, scale=5)
uni_y = stats.uniform.pdf(y, loc=0, scale=5)
M = np.dot(uni_x[:, None], uni_y[None, :])
im = plt.imshow(M, interpolation='none', origin='lower',
                cmap=jet, vmax=1, vmin=-.15, extent=(0, 5, 0, 5))
plt.scatter(lambda_2_true, lambda_1_true, c="k", s=50, edgecolor="none")
plt.xlim(0, 5)
plt.ylim(0, 5)
plt.title("Landscape formed by Uniform priors on $p_1, p_2$.")

plt.subplot(223)
plt.contour(x, y, M * L)
im = plt.imshow(M * L, interpolation='none', origin='lower',
                cmap=jet, extent=(0, 5, 0, 5))
plt.title("Landscape warped by %d data observation;\n Uniform priors on $p_1, p_2$." % N)
plt.scatter(lambda_2_true, lambda_1_true, c="k", s=50, edgecolor="none")
plt.xlim(0, 5)
plt.ylim(0, 5)

plt.subplot(222)
exp_x = stats.expon.pdf(x, loc=0, scale=3)
exp_y = stats.expon.pdf(y, loc=0, scale=10)
M = np.dot(exp_x[:, None], exp_y[None, :])
plt.contour(x, y, M)
im = plt.imshow(M, interpolation='none', origin='lower',
                cmap=jet, extent=(0, 5, 0, 5))
plt.scatter(lambda_2_true, lambda_1_true, c="k", s=50, edgecolor="none")
plt.xlim(0, 5)
plt.ylim(0, 5)
plt.title("Landscape formed by Exponential priors on $p_1, p_2$.")

plt.subplot(224)
# This is the likelihood times prior, which yields the posterior.
plt.contour(x, y, M * L)
im = plt.imshow(M * L, interpolation='none', origin='lower',
                cmap=jet, extent=(0, 5, 0, 5))
plt.scatter(lambda_2_true, lambda_1_true, c="k", s=50, edgecolor="none")
plt.title("Landscape warped by %d data observation;\n Exponential priors on $p_1, p_2$." % N)
plt.xlim(0, 5)
plt.ylim(0, 5)
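One point worth making explicit: the surface M * L plotted above is only proportional to the posterior. Below is a minimal sketch, not part of the original code, that normalizes it numerically over the same 100-point x and y grids defined above and reads off the grid location of the posterior mode.

# Sketch (an addition, not from the original notebook): normalize the
# prior-times-likelihood surface so it integrates to ~1 over the grid.
dx = x[1] - x[0]                       # x and y share the same uniform spacing
post = M * L                           # unnormalized posterior surface
post = post / (post.sum() * dx * dx)   # approximate 2-D Riemann sum
# rows of `post` index the lambda_1 grid, columns the lambda_2 grid
ix, iy = np.unravel_index(post.argmax(), post.shape)
print "posterior mode near lambda_1 = %.2f, lambda_2 = %.2f" % (x[ix], y[iy])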
figsize(12.5, 4)
data = np.loadtxt("data/mixture_data.csv", delimiter=",")

plt.hist(data, bins=20, color="k", histtype="stepfilled", alpha=0.8)
plt.title("Histogram of the dataset")
plt.ylim([0, None])
print data[:10], "..."

import pymc as pm

p = pm.Uniform("p", 0, 1)

assignment = pm.Categorical("assignment", [p, 1 - p], size=data.shape[0])
print "prior assignment, with p = %.2f:" % p.value
print assignment.value[:10], "..."

taus = 1.0 / pm.Uniform("stds", 0, 100, size=2) ** 2
centers = pm.Normal("centers", [120, 190], [0.01, 0.01], size=2)

"""
The below deterministic functions map an assignment, in this case 0 or 1,
to a set of parameters, located in the length-2 arrays `taus` and `centers`.
"""

@pm.deterministic
def center_i(assignment=assignment, centers=centers):
    return centers[assignment]

@pm.deterministic
def tau_i(assignment=assignment, taus=taus):
    return taus[assignment]

print "Random assignments: ", assignment.value[:4], "..."
print "Assigned center: ", center_i.value[:4], "..."
print "Assigned precision: ", tau_i.value[:4], "..."

# and to combine it with the observations:
observations = pm.Normal("obs", center_i, tau_i, value=data, observed=True)

# below we create a model class; the observed variable must be included,
# otherwise the data never enters the model.
model = pm.Model([p, assignment, observations, taus, centers])

mcmc = pm.MCMC(model)
mcmc.sample(50000)

figsize(12.5, 9)
plt.subplot(311)
lw = 1
center_trace = mcmc.trace("centers")[:]

# for pretty colors later in the book.
colors = ["#348ABD", "#A60628"] if center_trace[-1, 0] > center_trace[-1, 1] \
    else ["#A60628", "#348ABD"]

plt.plot(center_trace[:, 0], label="trace of center 0", c=colors[0], lw=lw)
plt.plot(center_trace[:, 1], label="trace of center 1", c=colors[1], lw=lw)
plt.title("Traces of unknown parameters")
leg = plt.legend(loc="upper right")
leg.get_frame().set_alpha(0.7)

plt.subplot(312)
std_trace = mcmc.trace("stds")[:]
plt.plot(std_trace[:, 0], label="trace of standard deviation of cluster 0",
         c=colors[0], lw=lw)
plt.plot(std_trace[:, 1], label="trace of standard deviation of cluster 1",
         c=colors[1], lw=lw)
plt.legend(loc="upper left")

plt.subplot(313)
p_trace = mcmc.trace("p")[:]
plt.plot(p_trace, label="$p$: frequency of assignment to cluster 0",
         color="#467821", lw=lw)
plt.xlabel("Steps")
plt.ylim(0, 1)
plt.legend()

mcmc.sample(100000)

figsize(12.5, 4)
center_trace = mcmc.trace("centers", chain=1)[:]
prev_center_trace = mcmc.trace("centers", chain=0)[:]

x = np.arange(50000)
plt.plot(x, prev_center_trace[:, 0], label="previous trace of center 0",
         lw=lw, alpha=0.4, c=colors[1])
plt.plot(x, prev_center_trace[:, 1], label="previous trace of center 1",
         lw=lw, alpha=0.4, c=colors[0])

x = np.arange(50000, 150000)
plt.plot(x, center_trace[:, 0], label="new trace of center 0",
         lw=lw, c="#348ABD")
plt.plot(x, center_trace[:, 1], label="new trace of center 1",
         lw=lw, c="#A60628")

plt.title("Traces of unknown center parameters")
leg = plt.legend(loc="upper right")
leg.get_frame().set_alpha(0.8)
plt.xlabel("Steps")

figsize(11.0, 4)
std_trace = mcmc.trace("stds")[:]

_i = [1, 2, 3, 4]  # subplot positions are 1-based
for i in range(2):
    plt.subplot(2, 2, _i[2 * i])
    plt.title("Posterior of center of cluster %d" % i)
    plt.hist(center_trace[:, i], color=colors[i], bins=30,
             histtype="stepfilled")

    plt.subplot(2, 2, _i[2 * i + 1])
    plt.title("Posterior of standard deviation of cluster %d" % i)
    plt.hist(std_trace[:, i], color=colors[i], bins=30,
             histtype="stepfilled")
    # plt.autoscale(tight=True)

plt.tight_layout()
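The traces support more than plots. A short sketch, not from the original, that reads numeric posterior summaries directly off the same center_trace array histogrammed above:

# Added example: posterior means and central 95% intervals computed
# straight from the MCMC samples.
for i in range(2):
    samples = center_trace[:, i]
    lo = np.percentile(samples, 2.5)
    hi = np.percentile(samples, 97.5)
    print "center %d: posterior mean %.1f, 95%% interval (%.1f, %.1f)" % \
          (i, samples.mean(), lo, hi)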
plt.title("Posterior of standard deviation of cluster %d" % i) plt.hist(std_trace[:, i], color=colors[i], bins=30, histtype="stepfilled") # plt.autoscale(tight=True) plt.tight_layout() import matplotlib as mpl figsize(12.5, 4.5) plt.cmap = mpl.colors.ListedColormap(colors) plt.imshow(mcmc.trace("assignment")[::400, np.argsort(data)], cmap=plt.cmap, aspect=.4, alpha=.9) plt.xticks(np.arange(0, data.shape[0], 40), ["%.2f" % s for s in np.sort(data)[::40]]) plt.ylabel("posterior sample") plt.xlabel("value of $i$th data point") plt.title("Posterior labels of data points") cmap = mpl.colors.LinearSegmentedColormap.from_list("BMH", colors) assign_trace = mcmc.trace("assignment")[:] plt.scatter(data, 1 - assign_trace.mean(axis=0), cmap=cmap, c=assign_trace.mean(axis=0), s=50) plt.ylim(-0.05, 1.05) plt.xlim(35, 300) plt.title("Probability of data point belonging to cluster 0") plt.ylabel("probability") plt.xlabel("value of data point") norm = stats.norm x = np.linspace(20, 300, 500) posterior_center_means = center_trace.mean(axis=0) posterior_std_means = std_trace.mean(axis=0) posterior_p_mean = mcmc.trace("p")[:].mean() plt.hist(data, bins=20, histtype="step", normed=True, color="k", lw=2, label="histogram of data") y = posterior_p_mean * norm.pdf(x, loc=posterior_center_means[0], scale=posterior_std_means[0]) plt.plot(x, y, label="Cluster 0 (using posterior-mean parameters)", lw=3) plt.fill_between(x, y, color=colors[1], alpha=0.3) y = (1 - posterior_p_mean) * norm.pdf(x, loc=posterior_center_means[1], scale=posterior_std_means[1]) plt.plot(x, y, label="Cluster 1 (using posterior-mean parameters)", lw=3) plt.fill_between(x, y, color=colors[0], alpha=0.3) plt.legend(loc="upper left") plt.title("Visualizing Clusters using posterior-mean parameters") import pymc as pm x = pm.Normal("x", 4, 10) y = pm.Lambda("y", lambda x=x: 10 - x, trace=True) ex_mcmc = pm.MCMC(pm.Model([x, y])) ex_mcmc.sample(500) plt.plot(ex_mcmc.trace("x")[:]) plt.plot(ex_mcmc.trace("y")[:]) plt.title("Displaying (extreme) case of dependence between unknowns") norm_pdf = stats.norm.pdf p_trace = mcmc.trace("p")[:] x = 175 v = p_trace * norm_pdf(x, loc=center_trace[:, 0], scale=std_trace[:, 0]) > \ (1 - p_trace) * norm_pdf(x, loc=center_trace[:, 1], scale=std_trace[:, 1]) print "Probability of belonging to cluster 1:", v.mean() figsize(12.5, 4) import pymc as pm x_t = pm.rnormal(0, 1, 200) x_t[0] = 0 y_t = np.zeros(200) for i in range(1, 200): y_t[i] = pm.rnormal(y_t[i - 1], 1) plt.plot(y_t, label="$y_t$", lw=3) plt.plot(x_t, label="$x_t$", lw=3) plt.xlabel("time, $t$") plt.legend() def autocorr(x): # from http://tinyurl.com/afz57c4 result = np.correlate(x, x, mode='full') result = result / np.max(result) return result[result.size / 2:] colors = ["#348ABD", "#A60628", "#7A68A6"] x = np.arange(1, 200) plt.bar(x, autocorr(y_t)[1:], width=1, label="$y_t$", edgecolor=colors[0], color=colors[0]) plt.bar(x, autocorr(x_t)[1:], width=1, label="$x_t$", color=colors[1], edgecolor=colors[1]) plt.legend(title="Autocorrelation") plt.ylabel("measured correlation \nbetween $y_t$ and $y_{t-k}$.") plt.xlabel("k (lag)") plt.title("Autocorrelation plot of $y_t$ and $x_t$ for differing $k$ lags.") max_x = 200 / 3 + 1 x = np.arange(1, max_x) plt.bar(x, autocorr(y_t)[1:max_x], edgecolor=colors[0], label="no thinning", color=colors[0], width=1) plt.bar(x, autocorr(y_t[::2])[1:max_x], edgecolor=colors[1], label="keeping every 2nd sample", color=colors[1], width=1) plt.bar(x, autocorr(y_t[::3])[1:max_x], width=1, edgecolor=colors[2], label="keeping every 
max_x = 200 / 3 + 1
x = np.arange(1, max_x)

plt.bar(x, autocorr(y_t)[1:max_x], edgecolor=colors[0],
        label="no thinning", color=colors[0], width=1)
plt.bar(x, autocorr(y_t[::2])[1:max_x], edgecolor=colors[1],
        label="keeping every 2nd sample", color=colors[1], width=1)
plt.bar(x, autocorr(y_t[::3])[1:max_x], width=1, edgecolor=colors[2],
        label="keeping every 3rd sample", color=colors[2])

plt.autoscale(tight=True)
plt.legend(title="Autocorrelation plot for $y_t$", loc="lower left")
plt.ylabel("measured correlation \nbetween $y_t$ and $y_{t-k}$.")
plt.xlabel("k (lag)")
plt.title("Autocorrelation of $y_t$ (no thinning vs. thinning) at differing $k$ lags.")

from pymc.Matplot import plot as mcplot

mcmc.sample(25000, 0, 10)
mcplot(mcmc.trace("centers", 2), common_scale=False)

from IPython.core.display import HTML

def css_styling():
    styles = open("../styles/custom.css", "r").read()
    return HTML(styles)

css_styling()
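As a complement to mcplot's output, PyMC also exposes the Geweke convergence diagnostic. A minimal sketch, not in the original, applied to the thinned trace of the first center; the +/-2 threshold is the usual rule of thumb.

# pm.geweke returns (interval start, z-score) pairs comparing early and
# late segments of the chain; most scores should fall within +/-2 if the
# chain has converged.
scores = pm.geweke(mcmc.trace("centers", 2)[:, 0])
frac = np.mean([abs(z) < 2 for _, z in scores])
print "fraction of Geweke z-scores within +/-2:", frac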