Example 9: Modelling with PyTracerLab in the Bayesian Statistical Framework¶

Here we show how to use the PyTracerLab package to perform a basic standard application of lumped parameter models (see Examples 1 and 2), but using another synthetically generated dataset. As in Example 2, we stay in a Bayesian statistical framework using a Markov-chain Monte Carlo sampler. The data, however, is not generated with a Lumped Parameter Model anymore - so we don’t expect our model to match the data perfectly and we don’t expect our model to recover some “true” parameters. We still know the process that generated the data but this process is not parameterized in the same way as our Lumped Parameter Models; it is a moderately complex MODFLOW model. See Rudolph et al. (2025) for more details on this MODFLOW model. The observations we consider here reflect average tracer concentration in river water. Tracer enters the system via recharge (bell-shaped pulse over a 50-year period, similar to the real tritium input) and has the same decay coefficient as tritium. Tracer-free water also enters the system via an infiltration well.

Conceptualizing the Problem for LPMs¶

We know that there is an infiltration well in our system that supplies tracer-free water. We can model such a contribution by adding a model unit that has a fixed, very large mean travel time (e.g., 500 years). We even know the flow rate with which the well infiltrates water - but we do not know the fraction of well water in river water. So we have to consider a model with at least two components: one component represent the tracer-free well water, the other component represents the actual system. Because we do not know the mixing ratio of the components, we have to consider several different scenarios of mixing ratios.

import PyTracerLab.model as ism
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import pandas as pd
import seaborn as sns

1. Load (Synthetic) Observation Data¶

See the PyTracerLab Paper on how this data is generated. The data generating model is described in more detail in Rudolph et al. (2025).

# load input series
# this would be the tracer concentration in precipitation or recharge in a
# practical problem
file_name = "input_monthly_modflow.csv"
data = np.genfromtxt(
    file_name,
    delimiter=",",
    dtype=["<U7", float],
    encoding="utf-8",
    skip_header=1
)

timestamps = np.array([datetime.strptime(row[0], r"%Y-%m") for row in data])
input_series = np.array([float(row[1]) for row in data])

# load observation series
# this would be the measured tracer concentration in groudnwater in a
# practical problem
file_name = "obs_monthly_modflow.csv"
data = np.genfromtxt(
    file_name,
    delimiter=",",
    dtype=["<U7", float],
    encoding="utf-8",
    skip_header=1
)

timestamps = np.array([datetime.strptime(row[0], r"%Y-%m") for row in data])
obs_series = np.array([float(row[1]) for row in data])

### plot input series, output series, and observations

# get observation timesteps
timesteps = [t.year + t.month / 12.0 for t in timestamps]

# create figure
fig, ax = plt.subplots(1, 1)
# plot input series
ax.plot(
    timesteps,
    input_series,
    label="Tracer Input",
    c="grey"
)
# plot observations
ax.scatter(
    timesteps,
    obs_series,
    label="Observations",
    color="red",
    marker="x",
    zorder=10
)
ax.set_xlim(timesteps[0], timesteps[-1])
ax.set_ylim(0.0, 105.)
ax.set_xlabel("Time [months]")
ax.set_ylabel("Tracer concentration [-]")
ax.legend()
ax.grid()
plt.show()

../_images/264eaa2f01f83159d822868f4194979271594984d75be78fae51eafd3d16b2eb.png

2. Set up Models, Run MCMC for Model Parameters¶

def run_mixing_fraction(em_fraction):
    t_half = 12.3 * 12.0
    lambda_ = np.log(2.0) / t_half

    ### define model (use the same structure / units as the true model)
    # time step is 1 month
    m = ism.Model(
        dt=1.0,
        lambda_=lambda_,
        input_series=input_series,
        target_series=obs_series,
        steady_state_input=None,
        n_warmup_half_lives=2
    )

    # add an exponential-flow unit
    # define the initial model parameters for inference
    em_mtt_init = 12 * 2 # 1 year

    m.add_unit(
        ism.EMUnit(mtt=em_mtt_init),
        fraction=em_fraction, # fraciton of the overall response
        bounds=[(12. * 0.5, 12.0 * 1000.)],
        prefix="em"
    )

    # add an piston-flow unit
    # define the initial model parameters for inference
    pm_mtt_init = 12 * 500 # 500 years, tracer-free water
    m.add_unit(
        ism.PMUnit(mtt=pm_mtt_init),
        fraction=1 - em_fraction, # 100 percent of the overall response
        bounds=[(12. * 0.5, 12.0 * 1000.)],
        prefix="pm"
    )

    # fix mtt of pm
    m.set_fixed("pm.mtt")

    # create a solver
    solver = ism.Solver(m)
    # run DREAM
    res = solver.dream_sample(
        n_samples=2000, # effective samples after burn-in and thinning
        burn_in=3000,
        thin=1,
        n_chains=3,
        n_diff_pairs=1,
        cr=[i / 10 for i in range(1, 11)],
        gamma_jitter=0.2,
        jitter=1.e-5,
        sigma=1.,
        random_state=123456,
        return_sim=True,
        set_model_state=False
    )

    # plot chains to inspect convergence
    fig, ax = plt.subplots(res["samples_chain"].shape[2], 1, figsize=(6, res["samples_chain"].shape[2] * 1.5), sharex=True)

    ax = np.atleast_1d(ax)

    for i in range(res["samples_chain"].shape[2]):
        for j in range(res["samples_chain"].shape[0]):
            ax[i].plot(res["samples_chain"][j, :, i] / 12., c="k", lw=.5, alpha=0.4)
        ax[i].set_ylabel(m.param_keys(free_only=True)[i])
        ax[i].set_xlim(0, res["samples_chain"].shape[1])
    ax[-1].set_xlabel("Steps")

    # print gelman rubin convergence statistics
    print(res["gelman_rubin"])

    # get travel time distributions
    step_limit = 12 * 1000
    # iterate over samples
    tt_dists = []
    samples = res["samples"]
    # get random subset
    samples = samples[np.random.choice(samples.shape[0], 2000, replace=True)]
    for sample in res["samples"]:
        # set model parameters
        for item in zip(m.param_keys(free_only=True), sample):
            m.set_param(key=item[0], value=item[1])
        # get travel time distribution
        ttds = m.get_ttds(n_steps=step_limit)
        tt_dists.append(
            ttds["fractions"][0] * ttds["distributions"][0] + \
            ttds["fractions"][1] * ttds["distributions"][1]
        )

    return res, tt_dists, m

np.linspace(.35, .65, 5)

array([0.35 , 0.425, 0.5  , 0.575, 0.65 ])

results, ttds, models = [], [], []

mixing_ratios = np.linspace(.35, .65, 5) # np.linspace(.45, .5, 3)

for fraction in mixing_ratios:
    r_, a_, m_ = run_mixing_fraction(fraction)
    results.append(r_)
    ttds.append(a_)
    models.append(m_)

{'em.mtt': 1.0007227597768051}
{'em.mtt': 1.0045099640221042}
{'em.mtt': 1.044586280057089}
{'em.mtt': 1.0016811375079617}
{'em.mtt': 1.0034975046278718}

../_images/e65a98414eb7d8fb2d181e9e74519d66769fcb5972a3575cfb9a8224b5460a29.png

../_images/7211cbcfbc470ce430843de5ad10ec865fec97cca1da8bb9fd99e7931917bb96.png

../_images/dccd677b6d8b461e4e805041fcf398e17c3f2de6ca871c031ab3036617917bfb.png

../_images/64a3e0b3e3aa46112b53acad12fb3bfeea0199e0360b618402106b6dd55a4d28.png

../_images/0ab9764594aabefeb3b8da677239b31ba4ba60414746df75f5037cd680b54b65.png

def nse(obs, sim):
    obs_ = obs.reshape(-1, 1)
    sim_ = sim.reshape(-1, 1)
    mask = ~np.isnan(obs_) & ~np.isnan(sim_)
    resid = sim_ - obs_
    dev = obs_ - np.nanmean(obs_)
    nse_ = 1 - np.sum(resid[mask] ** 2) / np.sum(dev[mask] ** 2)
    return nse_

# load modflow travel times from csv
tts_mf = pd.read_csv("tts_weights_modflow.csv", index_col=None)
tts_mf["tts"] /= 30.
tts_mf.head()

	tts	weights
0	59.99600	0.000344
1	59.99600	0.000344
2	59.99600	0.000344
3	59.99600	0.000344
4	19.33943	0.000344

fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(5, 6),
                       gridspec_kw={"height_ratios": [2, 1.2], "hspace": .5, "wspace": .1},
                       sharey="row", sharex="row")

start = 0
end = 900

colors = ["red", "limegreen", "royalblue", "darkorange", "black"]

for i in range(len(results)):
    # plot ssimulations
    sim_data = results[i]["sims"]
    s = sim_data.reshape((-1, sim_data.shape[-1]))
    q = np.quantile(s, [0.1, 0.5, 0.9], axis=0)

    ax[0].plot(
        timestamps[start:end],
        q[1, start:end],
        c=colors[i],
        ls="-",
        lw=2.,
        label=fr"{mixing_ratios[i] * 100:.1f}% EM, " + "\n" + \
            fr"$ NSE $ = {nse(obs_series[start:end], q[1, start:end]):.2f}"
    )
    ax[0].fill_between(
        timestamps[start:end],
        q[0, start:end], q[2, start:end],
        color=colors[i],
        alpha=0.1,
    )

    # plot travel time distributions
    step_limit = 12 * 100
    dt = 1.

    # integrate travel time distributions to check area integrates to 1
    # use trapezoidal rule
    integrals = []
    tt_dists_ = []
    for j in range(len(ttds[i])):
        I = np.trapezoid(ttds[i][j][:step_limit], dx=dt)
        tt_dists_.append(ttds[i][j][:step_limit] / I)
        integrals.append(I)
    print(np.mean(integrals), np.std(integrals))

    t_plot = np.arange(0, step_limit * dt, dt) # years
    ax[1].plot(
        t_plot, 
        np.median(tt_dists_, axis=0),
        alpha=1.,
        c=colors[i],
        lw=1.,
        ls="-",
        zorder=10,
        label=fr"{mixing_ratios[i] * 100:.1f}% EM"
    )
    ax[1].fill_between(
        t_plot, 
        np.quantile(tt_dists_, 0.9, axis=0),
        np.quantile(tt_dists_, 0.1, axis=0),
        alpha=.1,
        color=colors[i]
    )

ax[0].scatter(
    timestamps, obs_series,
    marker="o", facecolor="w",
    edgecolor="k", s=40,
    zorder=100, alpha=0.8,
    lw=1., label="Obs."
)
ax[0].legend(fontsize=8, handlelength=1.)
ax[0].grid(alpha=0.7)
ax[0].set_xlim(timestamps[0], timestamps[-1])
ax[0].set_ylim(0)
ax[0].set_xlabel("Date")
ax[0].set_ylabel(r"Concentration $ [-] $")
ax[0].set_title("Simluated and Observed Concentrations", fontsize=11)

sns.kdeplot(data=tts_mf, x="tts", weights="weights", ax=ax[1], zorder=100,
            clip=(0, step_limit), color="k", lw=2., bw_adjust=0.15, ls="--",
            label="MODFLOW")

# the xticks of the travel time distribution plot are in months
# we want them in years
xmax = step_limit # 250 years
step_years = 20
ax[1].legend(fontsize=8)
ax[1].set_xticks(np.arange(0, xmax, 12 * step_years))
ax[1].set_xticklabels(np.arange(0, xmax / 12, step_years, int))
ax[1].set_xlim(0, xmax)
ax[1].grid(alpha=0.7)
ax[1].set_xlabel(r"Travel Time $ [a] $")
ax[1].set_ylabel(r"Fraction $ [-] $")
ax[1].set_title("Inferred and Observed Travel Time Distributions", fontsize=11)

plt.savefig("modflow_PyTracerLab.png", dpi=600, bbox_inches="tight")

34222592261113316 0.00614455446109455
3855271223734189 0.024796749860033166
33098806501063355 0.09446046973423981
18774220839037972 0.0925718751274217
14247340502611522 0.05143481341824395

../_images/2fbd3b5ab091907b610cb5c4ecfa44bdfeb359ade7565e785d9b877f183a373d.png