TensorFlow Probability

  • TFP combines deep learning and probabilistic models
  • A DNN predicting binary outcomes is just a fancy parametrisation of a Bernoulli distribution (see the sketch below)
    • Encode knowledge through richer distributional assumptions!
      • control prediction variance
      • encode prior knowledge
      • ask and answer questions
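
A minimal sketch of that point, with synthetic data and names of my own: the familiar "DNN + sigmoid + cross-entropy" binary classifier is exactly a Bernoulli distribution whose logit is produced by the network.

import tensorflow as tf
import tensorflow_probability as tfp
tfd = tfp.distributions

# Synthetic data: 100 examples with 5 features and binary labels.
x = tf.random_normal([100, 5])
y = tf.cast(tf.random_uniform([100]) > 0.5, tf.float32)

logits = tf.layers.dense(x, 1)[:, 0]       # the "DNN" (a single linear layer here)
dist = tfd.Bernoulli(logits=logits)        # distribution over binary outcomes
loss = -tf.reduce_mean(dist.log_prob(y))   # == sigmoid cross-entropy loss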

Josh Dillon Talk

In [2]:
import tensorflow as tf
import tensorflow_probability as tfp
In [9]:
# Illustrative synthetic data (assumed here; the original cell expected a
# design matrix `x` and binary responses `y` to be defined already).
x = tf.random_normal([100, 3])
w_true = tf.constant([1., -2., 0.])
y = tf.cast(
    tf.random_uniform([100]) < tf.sigmoid(tf.tensordot(x, w_true, axes=1)),
    tf.float32)

model = tfp.glm.Bernoulli()

coeffs, linear_response, is_converged, num_iter = tfp.glm.fit_sparse(
    model_matrix=x, response=y,
    l1_regularizer=0.5,
    l2_regularizer=1.,
    model=model)
In [11]:
tfd = tfp.distributions
d = tfd.Normal(loc=[-1., 1.], scale=1.)

x = d.sample() # draw two random points
px = d.prob(x) # compute density/mass

x
px
Out[11]:
<tf.Tensor 'Normal/prob/Exp:0' shape=(2,) dtype=float32>
In [ ]:
# Bijectors transform distributions.

# Masked Autoregressive Flow for density estimation
# (https://arxiv.org/abs/1705.07057); the shift-and-log-scale network is
# your own DNN.

tfb = tfp.bijectors
dims = 2  # illustrative event dimensionality

maf = tfp.distributions.TransformedDistribution(
    distribution=tfp.distributions.Normal(loc=0., scale=1.),
    bijector=tfb.MaskedAutoregressiveFlow(
        shift_and_log_scale_fn=tfb.masked_autoregressive_default_template(
            hidden_layers=[512, 512])),
    event_shape=[dims])

x = tf.random_normal([dims])  # stand-in for a training example
loss = -maf.log_prob(x)       # DNN-powered PDF as a training objective

EX: Binomial Model - Fed Rate Prediction

In [28]:
from __future__ import absolute_import, division, print_function

warning_status = 'ignore' #@param ['ignore', 'always', 'module', 'once', 'default', 'error']
import warnings
warnings.filterwarnings(warning_status)
with warnings.catch_warnings():
    warnings.filterwarnings(warning_status, category=DeprecationWarning)
    warnings.filterwarnings(warning_status, category=UserWarning)


import numpy as np
import os
matplotlib_style = 'fivethirtyeight' #@param ['bmh', 'ggplot', 'seaborn', 'default', 'Solarize_Light2', 'classic', 'dark_background', 'seaborn-colorblind', 'seaborn-notebook']
import matplotlib.pyplot as plt; plt.style.use(matplotlib_style)
import matplotlib.axes as axes;
from matplotlib.patches import Ellipse
%matplotlib inline
import seaborn as sns; sns.set_context('notebook')
notebook_screen_res = 'png' #@param ['retina', 'png', 'jpeg', 'svg', 'pdf']
%config InlineBackend.figure_format = notebook_screen_res
In [ ]:
import tensorflow as tf
tfe = tf.contrib.eager

import tensorflow_probability as tfp
tfd = tfp.distributions
tfb = tfp.bijectors


def default_session_options(enable_gpu_ram_resizing=True,
                            enable_xla=False):
  """Creates default options for Graph-mode session."""
  config = tf.ConfigProto()
  config.log_device_placement = True
  if enable_gpu_ram_resizing:
    # `allow_growth=True` makes it possible to connect multiple
    # colabs to your GPU. Otherwise the colab malloc's all GPU ram.
    config.gpu_options.allow_growth = True
  if enable_xla:
    # Enable XLA. https://www.tensorflow.org/performance/xla/.
    config.graph_options.optimizer_options.global_jit_level = (
        tf.OptimizerOptions.ON_1)
  return config


def reset_session(options=None):
  """Creates a new global, interactive session in Graph-mode."""
  if tf.executing_eagerly():
    return
  global sess
  try:
    tf.reset_default_graph()
    sess.close()
  except:
    pass
  if options is None:
    options = default_session_options()
  sess = tf.InteractiveSession(config=options)


def evaluate(tensors):
  """Evaluates `Tensor`s or `EagerTensor`s to Numpy `ndarray`s.

  Args:
    tensors: Object of `Tensor`s or `EagerTensor`s; can be `list`, `tuple`,
      `namedtuple` or combinations thereof.

  Returns:
    ndarrays: Object with same structure as `tensors` except with `Tensor` or
      `EagerTensor`s replaced by Numpy `ndarray`s.
  """
  if tf.executing_eagerly():
    return tf.contrib.framework.nest.pack_sequence_as(
        tensors,
        [t.numpy() if tf.contrib.framework.is_tensor(t) else t
         for t in tf.contrib.framework.nest.flatten(tensors)])
  return sess.run(tensors)


# Eager Execution
use_tf_eager = True #@param {type:"boolean"}

# Use try/except so we can easily re-execute the whole notebook;
# fall back to a Graph-mode session if eager execution cannot be enabled.
if use_tf_eager:
  try:
    tf.enable_eager_execution()
  except:
    reset_session()
In [29]:
# color...
class _TFColor(object):
  """Enum of colors used in TF docs."""
  red = '#F15854'
  blue = '#5DA5DA'
  orange = '#FAA43A'
  green = '#60BD68'
  pink = '#F17CB0'
  brown = '#B2912F'
  purple = '#B276B2'
  yellow = '#DECF3F'
  gray = '#4D4D4D'
  def __getitem__(self, i):
    return [
        self.red,
        self.orange,
        self.green,
        self.blue,
        self.pink,
        self.brown,
        self.purple,
        self.yellow,
        self.gray,
    ][i % 9]
TFColor = _TFColor()
In [19]:
# Graph-mode: create an interactive session and a simple `evaluate` helper.
sess = tf.InteractiveSession()

def evaluate(tensors):
    return sess.run(tensors)
In [12]:
# Model parameters: 8 Fed meetings in the next 12 months, each of which may
# raise rates by 0.25%; the credit card rate starts at 12% (2% + 10%).
num_meeting = 8.
possible_fed_inc = tf.range(
    start=0.,
    limit=num_meeting + 1)                               # 0..8 possible increases
possible_rates = 2. + 10. + 0.25 * possible_fed_inc      # 12% .. 14%

# Candidate probabilities that the Fed raises rates at any given meeting.
proba_inc = tf.constant([0.6, 0.7, 0.8, 0.9])
In [17]:
# Add a trailing axis so the 4 candidate probabilities broadcast against the
# 9 possible outcomes: `proba_rate` has shape (4, 9).
proba_inc = proba_inc[..., tf.newaxis]
proba_rate = tfp.distributions.Binomial(total_count=num_meeting, probs=proba_inc).prob(possible_fed_inc)
In [21]:
# Convert Tensors to Numpy
[
    possible_rates_,
    proba_rate_,
    proba_inc_,
] = evaluate([
    possible_rates,
    proba_rate,
    proba_inc
])
In [60]:
proba_rate_[1]
Out[60]:
array([6.5609995e-05, 1.2247191e-03, 1.0001878e-02, 4.6675421e-02,
       1.3613670e-01, 2.5412178e-01, 2.9647535e-01, 1.9765022e-01,
       5.7647999e-02], dtype=float32)
In [62]:
# Visualise the binomial PMFs: one panel per probability of a rate increase.
plt.figure(figsize=(14, 9))
for i, pf in enumerate(proba_inc_):
    plt.subplot(2, 2, i + 1)
    plt.bar(possible_rates_,
            proba_rate_[i],
            color=TFColor[i],
            width=0.23,
            label="$p = {:.1f}$".format(pf[0]),
            alpha=0.6,
            edgecolor=TFColor[i],
            lw=3)
    plt.xticks(possible_rates_ + 0.125, possible_rates_)
    plt.xlim(12, 14.25)
    plt.ylim(0, 0.5)
    plt.ylabel("Probability of credit card rate")
    plt.xlabel("Credit card rate (%)")
    plt.title("Credit card rates: prob_fed_raises_rates = {:.1f}".format(pf[0]))
plt.suptitle("Est. of credit card rates in 12 months.", fontsize="x-large", y=1.02)
plt.tight_layout()

Intro Examples

Linear Mixed Effects with Ed2

  • Simple approach to structured-relationship data - aka a hierarchical linear model
  • Input features: a dictionary of "service", "students", and "instructors" vectors, one element per course
  • Regressing on the inputs, the model posits latent random variables and returns a distribution over evaluation ratings - running a TF session on this output returns a generation of ratings
  • See the tutorial using the tfp.mcmc.HamiltonianMonteCarlo algo to train the model, and to explore and interpret it using posterior predictions (a minimal sketch follows the model code below)
In [63]:
from tensorflow_probability import edward2 as ed
In [ ]:
def model(features):
    # set up FIXED EFFECTS and params
    intercept = tf.get_variable("intercept", [])
    service_effects = tf.get_variable("service_effects", [])
    student_stddev_unconst = tf.get_variable("student_stddev_pre", [])
    instructor_stddev_unconst = tf.get_variable("instructor_stddev_pre", [])

    # set up random effects
    student_effects = ed.MultivariateNormalDiag(
        loc=tf.zeros(num_students),
        scale_identity_multiplier=tf.exp(
            student_stddev_unconst),
        name="student_effects")
    instructor_effects = ed.MultivariateNormalDiag(
        loc=tf.zeros(num_instructors),
        scale_identity_multiplier=tf.exp(
            instructor_stddev_unconst),
        name="instructor_effects")

    # set up likelihood given fixed and random effects
    ratings = ed.Normal(
        loc=(service_effects * features["service"] +
                tf.gather(student_effects, features["students"]) +
                tf.gather(instructor_effects, features["instructors"]) +
                intercept),
        scale=1.,
        name="ratings")

    return ratings
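
A minimal sketch of the HamiltonianMonteCarlo exploration mentioned above, assuming `features`, observed `labels` (the ratings), `num_students`, and `num_instructors` are already defined as for the model; the keyword names passed to the joint log-prob are the random-variable names declared in `model`.

log_joint = ed.make_log_joint_fn(model)

def target_log_prob_fn(student_effects, instructor_effects):
    # Condition on the observed ratings; the fixed effects remain tf.Variables.
    return log_joint(features,
                     student_effects=student_effects,
                     instructor_effects=instructor_effects,
                     ratings=labels)

hmc = tfp.mcmc.HamiltonianMonteCarlo(
    target_log_prob_fn=target_log_prob_fn,
    step_size=0.015,
    num_leapfrog_steps=3)

states, kernel_results = tfp.mcmc.sample_chain(
    num_results=500,
    num_burnin_steps=200,
    current_state=[tf.zeros(num_students), tf.zeros(num_instructors)],
    kernel=hmc)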

Gaussian Copulas with TFP Bijectors

  • A copula is a multivariate probability distribution whose marginal distributions are uniform - built here with Bijectors and TransformedDistribution
  • The Gaussian Copula tutorial creates a few custom Bijectors and shows how to build several different copulas (a minimal sketch follows the examples below)
  • The TF Distribution Shapes HOWTO covers how to manage shapes for sampling, batch training, and modeling events
In [ ]:
tfd = tfp.distributions
tfb = tfp.bijectors

# Log-Normal Distri
log_normal = tfd.TransformedDistribution(
    distribution=tfd.Normal(loc=0., scale=1.),
    bijector=tfb.Exp())

# Kumaraswamy Distri
Kumaraswamy = tfd.TransformedDistribution(
    distribution=tfd.Uniform(low=0., high=1.),
    bijector=tfb.Kumaraswamy(
        concentration1=2.,
        concentration0=2.))

# Masked Autoregressive Flow
# https://arxiv.org/abs/1705.07057
shift_and_log_scale_fn = tfb.masked_autoregressive_default_template(
    hidden_layers=[512, 512])
maf = tfd.TransformedDistribution(
    distribution=tfd.Normal(loc=0., scale=1.),
    bijector=tfb.MaskedAutoregressiveFlow(
        shift_and_log_scale_fn=shift_and_log_scale_fn),
    event_shape=[28*28])
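
A minimal sketch of the Gaussian copula construction mentioned above, along the lines of the TFP copula tutorial: a custom scalar bijector applies the standard normal CDF elementwise to a correlated multivariate normal, giving uniform marginals that keep the Gaussian dependence structure. The 0.8 correlation is illustrative.

class NormalCDF(tfb.Bijector):
    """Maps a standard normal sample to [0, 1] via its CDF."""
    def __init__(self):
        self.base = tfd.Normal(loc=0., scale=1.)
        super(NormalCDF, self).__init__(
            forward_min_event_ndims=0,
            validate_args=False,
            name="NormalCDF")

    def _forward(self, x):
        return self.base.cdf(x)

    def _inverse(self, y):
        return self.base.quantile(y)

    def _inverse_log_det_jacobian(self, y):
        return -self.base.log_prob(self._inverse(y))

# Gaussian copula: push a correlated 2-D normal through the marginal CDFs.
gaussian_copula = tfd.TransformedDistribution(
    distribution=tfd.MultivariateNormalTriL(
        loc=[0., 0.],
        scale_tril=tf.cholesky([[1., 0.8], [0.8, 1.]])),
    bijector=NormalCDF())

u = gaussian_copula.sample(5)  # samples in [0, 1]^2 with correlated marginals
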

Variational Autoencoder with TFP

  • A VAE is an ML model that uses one learned system to represent data in some LOW-DIM space and a second learned system to RESTORE the LOW-DIM representation to what would otherwise have been the input
In [ ]:
# Assumes the user supplies `likelihood`, `prior`, and `surrogate_posterior`
# functions, each returning a tfd.Distribution-like object, plus a data batch
# `x` (a minimal sketch of these follows the cell).
elbo_loss = tfp.vi.monte_carlo_csiszar_f_divergence(
    f=tfp.vi.kl_reverse, # Equiv to evidence lower bound
    p_log_prob=lambda z: likelihood(z).log_prob(x) + prior().log_prob(z),
    q=surrogate_posterior(x),
    num_draws=1)

train = tf.train.AdadeltaOptimizer(learning_rate=0.01).minimize(elbo_loss)
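
A minimal sketch (hypothetical layer sizes and names) of the three user-supplied ingredients assumed above; `x` is a batch of flattened binary inputs of dimension data_dim.

latent_dim, data_dim = 16, 28 * 28

def prior():
    # p(z): standard diagonal Gaussian over the latent code.
    return tfd.MultivariateNormalDiag(loc=tf.zeros(latent_dim))

def likelihood(z):
    # p(x | z): decoder mapping the latent code to Bernoulli logits per pixel.
    logits = tf.layers.dense(z, data_dim)
    return tfd.Independent(tfd.Bernoulli(logits=logits),
                           reinterpreted_batch_ndims=1)

def surrogate_posterior(x):
    # q(z | x): encoder producing a diagonal Gaussian over the latent code.
    h = tf.layers.dense(x, 128, activation=tf.nn.relu)
    return tfd.MultivariateNormalDiag(
        loc=tf.layers.dense(h, latent_dim),
        scale_diag=tf.nn.softplus(tf.layers.dense(h, latent_dim)))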

Bayesian NN with TFP Layers

  • A BNN is a NN with a prior over its weights and biases - i.e. an infinite ensemble of NNs, with a probability assigned to each NN configuration by the prior
  • Latest paper: the Flipout estimator
  • Stochastic forward passes through the probabilistic convolutional and dense layers return an output tensor shaped [batch size, 10]
  • Each row of the tensor represents the logits (unconstrained probability values) that a data point belongs to one of the 10 classes
  • For training, build a loss function comprising 2 terms - the expected negative log-likelihood and the KL divergence
  • Approximate the expected NLL via Monte Carlo; the KL divergence is added via regularizer terms which are arguments to the layers (see the eager-mode loss sketch after the class below)

    tfp.layers can also be used with eager execution using the Model class

class MNISTModel(tf.keras.Model):
    def __init__(self):
        super(MNISTModel, self).__init__()
        self.dense1 = tfp.layers.DenseFlipout(units=10)
        self.dense2 = tfp.layers.DenseFlipout(units=10)

    def call(self, input):
        """Run the model"""
        result = self.dense1(input)
        result = self.dense2(result)
        # reuse var from dense 2 layer
        result = self.dense2(result)
        return result

model = MNISTModel()
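
A minimal sketch (hypothetical `images`/`labels` batch, integer class labels) of the eager-mode loss for this model: the expected negative log-likelihood plus the KL terms the Flipout layers record as layer losses.

def loss_fn(model, images, labels):
    logits = model(images)
    neg_log_likelihood = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels, logits=logits))
    kl = sum(model.losses)  # KL terms contributed by the DenseFlipout layers
    return neg_log_likelihood + kl
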
In [ ]:
model = tf.keras.Sequential([
    tf.keras.layers.Reshape([32, 32, 3]),
    tfp.layers.Convolution2DFlipout(
        64, kernel_size=5, padding='SAME', activation=tf.nn.relu),
    tf.keras.layers.MaxPool2D(pool_size=[2, 2], strides=[2, 2], padding='SAME'),
    tf.keras.layers.Reshape([16 * 16 * 64]),
    tfp.layers.DenseFlipout(10)
])

# `features`: a batch of 32x32x3 images (per the Reshape above);
# `labels`: one-hot vectors over the 10 classes.
logits = model(features)

neg_log_likelihood = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    labels=labels, logits=logits))
kl = sum(model.get_losses_for(inputs=None))
loss = neg_log_likelihood + kl
train_op = tf.train.AdamOptimizer().minimize(loss)