Probabilistic Programming in Python: Bayesian Modeling and Probabilistic Machine Learning with Theano



import pymc3 as pm
import pandas as pd
import numpy as np
import theano.tensor as tt
from bokeh.plotting import figure, show
from bokeh.models import BoxAnnotation, Span, Label, Legend
from bokeh.io import output_notebook
from bokeh.palettes import brewer
data_monthly = pd.read_csv(pm.get_data("monthly_in_situ_co2_mlo.csv"), header=56)
# - replace -99.99 with NaN
data_monthly.replace(to_replace=-99.99, value=np.nan, inplace=True)
# fix column names
cols = ["year", "month", "--", "--", "CO2", "seasonaly_adjusted", "fit",
        "seasonally_adjusted_fit", "CO2_filled", "seasonally_adjusted_filled"]
data_monthly.columns = cols
cols.remove("--"); cols.remove("--")
data_monthly = data_monthly[cols]
# drop rows with nan
data_monthly.dropna(inplace=True)
# fix time index
data_monthly["day"] = 15
data_monthly.index = pd.to_datetime(data_monthly[["year", "month", "day"]])
cols.remove("year"); cols.remove("month")
data_monthly = data_monthly[cols]
data_monthly.head(5)
# function to convert datetimes to numbers that are useful to algorithms
#   this will be useful later when doing prediction
def dates_to_idx(timelist):
    reference_time = pd.to_datetime('1958-03-15')
    t = (timelist - reference_time) / pd.Timedelta(1, "Y")
    return np.asarray(t)
t = dates_to_idx(data_monthly.index)
# normalize CO2 levels
y = data_monthly["CO2"].values
first_co2 = y[0]
std_co2 = np.std(y)
y_n = (y - first_co2) / std_co2
data_monthly = data_monthly.assign(t = t)
data_monthly = data_monthly.assign(y_n = y_n)
# split into training and test set
sep_idx = data_monthly.index.searchsorted(pd.to_datetime("2003-12-15"))
data_prior = data_monthly.iloc[:sep_idx+1, :]
data_after = data_monthly.iloc[sep_idx:, :]
# make plot
p = figure(x_axis_type='datetime', title='Monthly CO2 Readings from Mauna Loa',
           plot_width=700, plot_height=400)
p.yaxis.axis_label = 'CO2 [ppm]'
p.xaxis.axis_label = 'Date'
predict_region = BoxAnnotation(left=pd.to_datetime("2003-12-15"),
                               fill_alpha=0.1, fill_color="firebrick")
p.add_layout(predict_region)
ppm400 = Span(location=400,
              dimension='width', line_color='red',
              line_dash='dashed', line_width=2)
p.add_layout(ppm400)
p.line(data_monthly.index, data_monthly['CO2'],
       line_width=2, line_color="black", alpha=0.5)
p.circle(data_monthly.index, data_monthly['CO2'],
         line_color="black", alpha=0.1, size=2)
train_label = Label(x=100, y=165, x_units='screen', y_units='screen',
                    text='Training Set', render_mode='css', border_line_alpha=0.0,
                    background_fill_alpha=0.0)
test_label  = Label(x=585, y=80, x_units='screen', y_units='screen',
                    text='Test Set', render_mode='css', border_line_alpha=0.0,
                    background_fill_alpha=0.0)
p.add_layout(train_label)
p.add_layout(test_label)

show(p)

x = np.linspace(0, 150, 5000)
priors = [
    ("ℓ_pdecay",  pm.Gamma.dist(alpha=10, beta=0.075)),
    ("ℓ_psmooth", pm.Gamma.dist(alpha=4,  beta=3)),
    ("period",    pm.Normal.dist(mu=1.0,  sd=0.05)),
    ("ℓ_med",     pm.Gamma.dist(alpha=2,  beta=0.75)),
    ("α",         pm.Gamma.dist(alpha=5,  beta=2)),
    ("ℓ_trend",   pm.Gamma.dist(alpha=4,  beta=0.1)),
    ("ℓ_noise",   pm.Gamma.dist(alpha=2,  beta=4))]
colors = brewer['Paired'][7]
p = figure(title='Lengthscale and period priors',
           plot_width=700, plot_height=400)
p.yaxis.axis_label = 'Probability'
p.xaxis.axis_label = 'Years'
for i, prior in enumerate(priors):
    p.line(x, np.exp(prior[1].logp(x).eval()), legend=prior[0],
           line_width=3, line_color=colors[i])
show(p)

x = np.linspace(0, 4, 5000)
priors = [
    ("η_per",   pm.HalfCauchy.dist(beta=2)),
    ("η_med",   pm.HalfCauchy.dist(beta=1.0)),
    ("η_trend", pm.HalfCauchy.dist(beta=3)), # will use beta=2, but 2.2 is visible on plot
    ("σ",       pm.HalfNormal.dist(sd=0.25)),
    ("η_noise", pm.HalfNormal.dist(sd=0.5))]
colors = brewer['Paired'][5]
p = figure(title='Scale priors',
           plot_width=700, plot_height=400)
p.yaxis.axis_label = 'Probability'
p.xaxis.axis_label = 'Years'
for i, prior in enumerate(priors):
    p.line(x, np.exp(prior[1].logp(x).eval()), legend=prior[0],
           line_width=3, line_color=colors[i])
show(p)












# pull out normalized data
t = data_prior["t"].values[:,None]
y = data_prior["y_n"].values
with pm.Model() as model:
    # yearly periodic component x long term trend
    η_per = pm.HalfCauchy("η_per", beta=2, testval=1.0)
    ℓ_pdecay = pm.Gamma("ℓ_pdecay", alpha=10, beta=0.075)
    period  = pm.Normal("period", mu=1, sd=0.05)
    ℓ_psmooth = pm.Gamma("ℓ_psmooth ", alpha=4, beta=3)
    cov_seasonal = η_per**2 * pm.gp.cov.Periodic(1, period, ℓ_psmooth) \
                            * pm.gp.cov.Matern52(1, ℓ_pdecay)
    gp_seasonal = pm.gp.Marginal(cov_func=cov_seasonal)
    # small/medium term irregularities
    η_med = pm.HalfCauchy("η_med", beta=0.5, testval=0.1)
    ℓ_med = pm.Gamma("ℓ_med", alpha=2, beta=0.75)
    α = pm.Gamma("α", alpha=5, beta=2)
    cov_medium = η_med**2 * pm.gp.cov.RatQuad(1, ℓ_med, α)
    gp_medium = pm.gp.Marginal(cov_func=cov_medium)
    # long term trend
    η_trend = pm.HalfCauchy("η_trend", beta=2, testval=2.0)
    ℓ_trend = pm.Gamma("ℓ_trend", alpha=4, beta=0.1)
    cov_trend = η_trend**2 * pm.gp.cov.ExpQuad(1, ℓ_trend)
    gp_trend = pm.gp.Marginal(cov_func=cov_trend)
    # noise model
    η_noise = pm.HalfNormal("η_noise", sd=0.5, testval=0.05)
    ℓ_noise = pm.Gamma("ℓ_noise", alpha=2, beta=4)
    σ  = pm.HalfNormal("σ",  sd=0.25, testval=0.05)
    cov_noise = η_noise**2 * pm.gp.cov.Matern32(1, ℓ_noise) +\
                pm.gp.cov.WhiteNoise(σ)
    gp = gp_seasonal + gp_medium + gp_trend
    y_ = gp.marginal_likelihood("y", X=t, y=y, noise=cov_noise)
    mp = pm.find_MAP(include_transformed=True)


#Error line
#------------------------------------------------------------------------------------------   
#logp = 1,667.4, ||grad|| = 1.7276: 100%|██████████| 263/263 [01:27<00:00,  3.02it/s]
#------------------------------------------------------------------------------------------


# display the results, dont show transformed parameter values
sorted([name+":"+str(mp[name]) for name in mp.keys() if not name.endswith("_")])
['period:0.999726976161941',
 'α:1.170964081505256',
 'η_med:0.024147872518165272',
 'η_noise:0.007809337970705424',
 'η_per:0.10222380093634102',
 'η_trend:1.8313633958127935',
 'σ:0.0067451696343060665',
 'ℓ_med:1.5439691919986576',
 'ℓ_noise:0.17300634495608952',
 'ℓ_pdecay:126.01206470514794',
 'ℓ_psmooth :0.7362483610055942',
 'ℓ_trend:51.56586203609078']
# predict at a 15 day granularity
dates = pd.date_range(start='3/15/1958', end="12/15/2003", freq="15D")
tnew = dates_to_idx(dates)[:,None]
print("Predicting with gp ...")
mu, var = gp.predict(tnew, point=mp, diag=True)
mean_pred = mu*std_co2 + first_co2
var_pred  = var*std_co2**2
# make dataframe to store fit results
fit = pd.DataFrame({"t": tnew.flatten(),
                    "mu_total": mean_pred,
                    "sd_total": np.sqrt(var_pred)},
                   index=dates)
print("Predicting with gp_trend ...")
mu, var = gp_trend.predict(tnew, point=mp,
                           given={"gp": gp, "X": t, "y": y, "noise": cov_noise},
                           diag=True)
fit = fit.assign(mu_trend = mu*std_co2 + first_co2,
                 sd_trend = np.sqrt(var*std_co2**2))
print("Predicting with gp_medium ...")
mu, var = gp_medium.predict(tnew, point=mp,
                            given={"gp": gp, "X": t, "y": y, "noise": cov_noise},
                            diag=True)
fit = fit.assign(mu_medium = mu*std_co2 + first_co2,
                 sd_medium = np.sqrt(var*std_co2**2))
print("Predicting with gp_seasonal ...")
mu, var = gp_seasonal.predict(tnew, point=mp,
                              given={"gp": gp, "X": t, "y": y, "noise": cov_noise},
                              diag=True)
fit = fit.assign(mu_seasonal = mu*std_co2 + first_co2,
                 sd_seasonal = np.sqrt(var*std_co2**2))
print("Done")
## make plot
p = figure(title="Fit to the Mauna Loa Data",
           x_axis_type='datetime', plot_width=700, plot_height=400)
p.yaxis.axis_label = 'CO2 [ppm]'
p.xaxis.axis_label = 'Date'
# plot mean and 2σ region of total prediction
upper = fit.mu_total + 2*fit.sd_total
lower = fit.mu_total - 2*fit.sd_total
band_x = np.append(fit.index.values, fit.index.values[::-1])
band_y = np.append(lower, upper[::-1])
# total fit
p.line(fit.index, fit.mu_total,
       line_width=1, line_color="firebrick", legend="Total fit")
p.patch(band_x, band_y,
        color="firebrick", alpha=0.6, line_color="white")
# trend
p.line(fit.index, fit.mu_trend,
       line_width=1, line_color="blue", legend="Long term trend")
# medium
p.line(fit.index, fit.mu_medium,
       line_width=1, line_color="green", legend="Medium range variation")
# seasonal
p.line(fit.index, fit.mu_seasonal,
       line_width=1, line_color="orange", legend="Seasonal process")
# true value
p.circle(data_prior.index, data_prior['CO2'],
         color="black", legend="Observed data")
p.legend.location = "top_left"
show(p)

Yorumlar

Bu blogdaki popüler yayınlar

Interesting information intermsof Eartquake Statisticians

Obspy-about Developments

Another Analysis Conclusion by Seismic Shock Thesis