Source code for ovejero.bnn_alexnet

# -*- coding: utf-8 -*-
"""
Build the TensorFlow model and loss functions

This module contains the functions needed to build the BNN model used in
ovejero as well as the loss functions for the different posteriors.

See the script model_trainer.py for examples of how to use these functions.
"""

import tensorflow as tf
import numpy as np
from tensorflow.keras import initializers, activations
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Flatten, Conv2D, MaxPooling2D, Input, Dense
from tensorflow.keras.layers import Layer, InputSpec


[docs]class AlwaysDropout(Layer):
	"""
	This class applies dropout to an input both during training and inference.
	This is consistent with the BNN methodology.
	"""
	def __init__(self, dropout_rate, **kwargs):
		"""
		Initialize the AlwaysDropout layer.

		Parameters:
			dropout_rate (float): A number in the range [0,1) that will serve
				as the dropout rate for the layer. A larger rate means more
				dropout.
		"""
		super(AlwaysDropout, self).__init__(**kwargs)
		# Check for a bad dropout input
		if dropout_rate >= 1.0 or dropout_rate < 0.0:
			raise ValueError('dropout rate of %f not between 0 and 1' % (
				dropout_rate))
		# Save the dropout rate for later.
		self.dropout_rate = dropout_rate

[docs]	def call(self, inputs, training=None):
		"""
		The function that takes the inputs (likely outputs of a previous layer)
		and conducts dropout.

		Parameters:
			inputs (tf.Keras.Layer): The inputs to the Dense layer.
			training (bool): A required input for call. Setting training to
				true or false does nothing because always dropout behaves the
				same way in both cases.

		Returns:
			(tf.Keras.Layer): The output of the Dense layer.
		"""
		return tf.nn.dropout(inputs, self.dropout_rate)

[docs]	def get_config(self):
		"""
		Return the configuration dictionary required by Keras.
		"""
		config = {'dropout_rate': self.dropout_rate}
		base_config = super(AlwaysDropout, self).get_config()
		return dict(list(base_config.items()) + list(config.items()))

[docs]	def compute_output_shape(self, input_shape):
		"""
		Compute the shape of the output given the input. Needed for Keras
		layer.

		Parameters:
			input_shape ((int,...)): The shape of the input to our Dense layer.

		Returns:
			((int,...)): The output shape of the layer.
		"""
		return input_shape


[docs]def cd_regularizer(p, kernel, kernel_regularizer, dropout_regularizer,
	input_dim):
	"""
	Calculate the regularization term for concrete dropout.

	Parameters:
		p (tf.Tensor): A 1D Tensor containing the p value for dropout (between
			0 and 1).
		kernel (tf.Tensor): A 2D Tensor defining the weights of the Dense
			layer
		kernel_initializer (float): The relative strength of kernel
			regularization term.
		dropout_regularizer (float): The relative strength of the dropout
			regularization term.
		input_dim (int): The dimension of the input to the layer.

	Returns:
		(tf.Tensor): The tensorflow graph to calculate the regularization term.

	Notes:
		This is currently not being used because of issues with the Keras
		framework. Once it updates this will be employed instead of dividing
		the loss into two parts.
	"""
	regularizer = p * K.log(p)
	regularizer += (1.0 - p) + K.log(1.0 - p)
	regularizer *= dropout_regularizer * input_dim
	regularizer += kernel_regularizer * K.sum(K.square(kernel)) / (1.0 - p)
	return regularizer


[docs]class ConcreteDropout(Layer):
	"""
	This class defines a concrete dropout layer that is built around a
	Keras Dense layer. The dropout is parametrized by a weight that is
	optimized along with the model's weights themselves. Heavy inspiration
	from code for arxiv.1705.07832.
	"""
	def __init__(self, output_dim, activation=None,
		kernel_initializer='glorot_uniform', bias_initializer='zeros',
		kernel_regularizer=1e-6, dropout_regularizer=1e-5, init_min=0.1,
		init_max=0.1, temp=0.1, random_seed=None, **kwargs):
		"""
		Initialize the Concrete dropout Dense layer. This will initialize the
		dense layer along with the overhead needed for concrete dropout.

		Parameters:
			output_dim (int): The number of output parameters
			activation (str): The type of activation function to be used. Will
				be passed into tensorflow's activation function library.
			kernel_initializer (str): The type of initializer to use for the
				kernel. Will be passed to tensorflow's initializer library
			bias_initializer (str): The type of initializer to use for the
				bias. Will be passed to tensorflow's initializer library
			kernel_regularizer (float): The strength of the concrete dropout
				regularization term
			dropout_regularizer (float): The strength of the concrete dropout
				p regularization term
			init_min (float): The minimum initial value of the dropout rate
			init_max (float): The maximum initial value of the dropout rate
			temp (float): The temperature that defines how close the concrete
				distribution will be to true dropout.
			random_seed (int): A seed to use in the random function calls. If
				None no explicit seed will be used.

		Returns:
			(keras.Layer): The initialized ConcreteDropout layer. Must still be
				built.

		Notes:
			Technically the regularization terms must be divided by the number
				of training examples. This is degenerate with the value of the
				regularizers, so we do not specify it here.
			The initial dropout rate will be drawn from a uniform distribution
				with the bounds passed into init.
		"""
		# We do this because Keras does this
		if 'input_shape' not in kwargs and 'input_dim' in kwargs:
			kwargs['input_shape'] = (kwargs.pop('input_dim'),)
		# First initialize the properties required by the Dense class
		super(ConcreteDropout, self).__init__(**kwargs)
		# Save everything important to self
		self.output_dim = output_dim
		self.activation = activations.get(activation)
		self.kernel_initializer = initializers.get(
			kernel_initializer)
		self.bias_initializer = initializers.get(bias_initializer)
		self.kernel_regularizer = kernel_regularizer
		self.dropout_regularizer = dropout_regularizer
		# Convert to logit space (since we want to parameterize our weights
		# such that any value outputted by the network is valid).
		self.init_min = np.log(init_min) - np.log(1.0 - init_min)
		self.init_max = np.log(init_max) - np.log(1.0 - init_max)
		self.temp = temp
		self.random_seed = random_seed

[docs]	def build(self, input_shape=None):
		"""
		Build the weights and operations that the network will use.

		Parameters:
			input_shape ((int,...)): The shape of the input to our Dense layer.
		"""
		assert len(input_shape) >= 2
		input_dim = input_shape[-1]

		self.kernel = self.add_weight(shape=(input_dim, self.output_dim),
			initializer=self.kernel_initializer, name='kernel')
		self.bias = self.add_weight(shape=(self.output_dim,),
			initializer=self.bias_initializer, name='bias')
		# Although we define p in logit space, we then apply the sigmoid
		# operation to get the desired value between 0 and 1.
		self.p_logit = self.add_weight(name='p_logit', shape=(1,),
			initializer=initializers.RandomUniform(self.init_min,
				self.init_max), trainable=True)

		# Because of issues with Keras, these functions need to be defined
		# here.
		def p_logit_regularizer(p_logit):
			"""
			Calculate the regularization term for p_logit.

			Parameters:
				p_logit (tf.Tensor): A 1D Tensor containing the p_logit value
					for dropout.

			Returns:
				(tf.Tensor): The tensorflow graph to calculate the
					p_logit regularization term.
			"""
			# Although we define p in logit space, we then apply the sigmoid
			# operation to get the desired value between 0 and 1.
			p = K.sum(K.sigmoid(p_logit))
			regularizer = p * K.log(p)
			regularizer += (1.0 - p) * K.log(1.0 - p)
			regularizer *= self.dropout_regularizer * input_dim
			return regularizer

		def kernel_regularizer(kernel):
			"""
			Calculate the regularization term for concrete dropout.

			Parameters:
				kernel (tf.Tensor): A 2D Tensor containing the kernel for our
					Dense layer computation.

			Returns:
				(tf.Tensor): The tensorflow graph to calculate the
					kernel regularization term.
			"""
			regularizer = self.kernel_regularizer * K.sum(
				K.square(kernel)) / (1.0 - K.sum(K.sigmoid(self.p_logit)))
			return regularizer

		# This is supposed to change in later versions.
		self._handle_weight_regularization('p_logit_regularizer',self.p_logit,
			p_logit_regularizer)
		self._handle_weight_regularization('kernel_regularizer',self.kernel,
			kernel_regularizer)

		# Requirement for Keras
		self.input_spec = InputSpec(min_ndim=2, axes={-1: input_dim})
		self.built = True

[docs]	def call(self, inputs, training=None):
		"""
		The function that takes the inputs of the layer and conducts the
		Dense layer multiplication with concrete dropout.

		Parameters:
			inputs (tf.Keras.Layer): The inputs to the Dense layer.
			training (bool): A required input for call. Setting training to
				true or false does nothing because concrete dropout behaves
				the same way in both cases.

		Returns:
			(tf.Keras.Layer): The output of the Dense layer.
		"""
		# Small epsilon parameter needed for stable optimization
		eps = K.cast_to_floatx(K.epsilon())

		# Build the random tensor for dropout from uniform noise. This
		# formulation allows for a derivative with respect to p.
		unif_noise = K.random_uniform(shape=K.shape(inputs),
			seed=self.random_seed)
		drop_prob = (K.log(K.sigmoid(self.p_logit)+eps) - K.log(1.0-
			K.sigmoid(self.p_logit) + eps) + K.log(unif_noise + eps) -
			K.log(1.0 - unif_noise + eps))
		drop_prob = K.sigmoid(drop_prob / self.temp)
		inputs *= (1.0 - drop_prob)
		inputs /= (1.0 - K.sigmoid(self.p_logit))

		# Now just carry out the basic operations of a Dense layer.
		output = K.dot(inputs, self.kernel)
		output = K.bias_add(output, self.bias, data_format='channels_last')
		if self.activation is not None:
			output = self.activation(output)
		return output

[docs]	def compute_output_shape(self, input_shape):
		"""
		Compute the shape of the output given the input. Needed for Keras
		layer.

		Parameters:
		input_shape ((int,...)): The shape of the input to our Dense layer.

		Returns:
		((int,...)): The output shape of the layer.
		"""
		output_shape = list(input_shape)
		output_shape[-1] = self.output_dim
		return tuple(output_shape)

[docs]	def get_config(self):
		"""
		Return the configuration dictionary required by Keras.
		"""
		config = {
			'output_shape': self.output_shape,
			'activation': activations.serialize(self.activation),
			'kernel_initializer': initializers.serialize(
				self.kernel_initializer),
			'bias_initializer': initializers.serialize(
				self.bias_initializer),
			'kernel_regularizer': self.kernel_regularizer,
			'dropout_regularizer': self.dropout_regularizer
		}
		base_config = super(ConcreteDropout, self).get_config()
		return dict(list(base_config.items()) + list(config.items()))


[docs]class SpatialConcreteDropout(Conv2D):
	"""
	This class defines a spatial concrete dropout layer that is built around a
	Keras Conv2D layer. The dropout is parametrized by a weight that is
	optimized along with the model's weights themselves. Heavy inspiration
	from code for arxiv.1705.07832.
	"""
	def __init__(self, filters, kernel_size, strides=(1,1), padding='valid',
		activation=None, kernel_regularizer=1e-6, dropout_regularizer=1e-5,
		init_min=0.1, init_max=0.1, temp=0.1, random_seed=None, **kwargs):
		"""
		Initialize the Spatial Concrete dropout Dense layer. This will initialize
		the Conv2d layer along with the overhead needed for spatial concrete
		dropout.

		ParametersL
			filters (int): The number of filters to use for the Conv2D layer
			kernel_size ((int,int)): The dimensions of the kernel for the
				Conv2D layer
			strides ((int,int)): The stride to take in each direction for the
				Conv2D layer.
			padding (str): What type of padding to use to get the desired
				output dimensions from the Conv2D layer. Either valid or same
			activation (str): The type of activation function to be used. Will
				be passed into tensorflow's activation function library.
			kernel_regularizer (float): The strength of the concrete dropout
				regularization term
			dropout_regularizer (float): The strength of the concrete dropout
				p regularization term
			init_min (float): The minimum initial value of the dropout rate
			init_max (float): The maximum initial value of the dropout rate
			temp (float): The temperature that defines how close the concrete
				distribution will be to true dropout.
			random_seed (int): A seed to use in the random function calls. If
				None no explicit seed will be used.

		Returns:
			(keras.Layer): The initialized SpatialConcreteDropout layer. Must
				still be built.

		Notes:
			Technically the regularization terms must be divided by the number
				of training examples. This is degenerate with the value of the
				regularizers, so we do not specify it here.
			The initial dropout rate will be drawn from a uniform distribution
				with the bounds passed into init.
		"""
		super(SpatialConcreteDropout, self).__init__(filters, kernel_size,
			strides=strides, padding=padding, activation=activation, **kwargs)
		# Need to change name to avoid issues with Conv2D
		self.cd_kernel_regularizer = kernel_regularizer
		self.dropout_regularizer =dropout_regularizer
		self.init_min = np.log(init_min) - np.log(1.0 - init_min)
		self.init_max = np.log(init_max) - np.log(1.0 - init_max)
		self.temp = temp
		self.random_seed = random_seed

[docs]	def build(self, input_shape=None):
		"""
		Build the weights and operations that the network will use.

		Parameters:
			input_shape ((int,...)): The shape of the input to our Conv2D layer.
		"""
		super(SpatialConcreteDropout, self).build(input_shape)
		input_dim = input_shape[3]

		# kernel already set by inherited build function.
		# Although we define p in logit space, we then apply the sigmoid
		# operation to get the desired value between 0 and 1.
		self.p_logit = self.add_weight(name='p_logit',shape=(1,),
			initializer=initializers.RandomUniform(self.init_min,
				self.init_max), trainable=True)

		# Because of issues with Keras, these functions need to be defined
		# here.
		def p_logit_regularizer(p_logit):
			"""
			Calculate the regularization term for p_logit.

			Parameters:
				p_logit (tf.Tensor): A 1D Tensor containing the p_logit value
					for dropout.

			Returns:
				(tf.Tensor): The tensorflow graph to calculate the
					p_logit regularization term.
			"""
			# Although we define p in logit space, we then apply the sigmoid
			# operation to get the desired value between 0 and 1.
			p = K.sum(K.sigmoid(p_logit))
			regularizer = p * K.log(p)
			regularizer += (1.0 - p) * K.log(1.0 - p)
			regularizer *= self.dropout_regularizer * input_dim
			return regularizer

		def kernel_regularizer(kernel):
			"""
			Calculate the regularization term for concrete dropout.

			Parameters:
				kernel (tf.Tensor): A 2D Tensor containing the kernel for our
					Dense layer computation.

			Returns:
				(tf.Tensor): The tensorflow graph to calculate the
					kernel regularization term.
			"""
			regularizer = self.cd_kernel_regularizer * K.sum(
				K.square(kernel)) / (1.0 - K.sum(K.sigmoid(self.p_logit)))
			return regularizer

		# This is supposed to change in later versions.
		self._handle_weight_regularization('p_logit_regularizer',self.p_logit,
			p_logit_regularizer)
		self._handle_weight_regularization('kernel_regularizer',self.kernel,
			kernel_regularizer)

		self.built = True

[docs]	def call(self, inputs, training=None):
		"""
		The function that takes the inputs of the layer and conducts the
		Dense layer multiplication with concrete dropout.

		Parameters:
			inputs (tf.Keras.Layer): The inputs to the Dense layer.
			training (bool): A required input for call. Setting training to
				true or false does nothing because concrete dropout behaves the
				same way in both cases.

		Returns:
			(tf.Keras.Layer): The output of the Dense layer.
		"""
		# Small epsilon parameter needed for stable optimization
		eps = K.cast_to_floatx(K.epsilon())

		# Build the random tensor for dropout from uniform noise. This
		# formulation allows for a derivative with respect to p.
		input_shape = K.shape(inputs)
		noise_shape = (input_shape[0], 1, 1, input_shape[3])
		unif_noise = K.random_uniform(shape=noise_shape,
			seed=self.random_seed)
		drop_prob = (K.log(K.sigmoid(self.p_logit)+eps) -
			K.log(1.0-K.sigmoid(self.p_logit)+eps) + K.log(unif_noise + eps)
			- K.log(1.0 - unif_noise + eps))
		drop_prob = K.sigmoid(drop_prob/self.temp)
		inputs *= (1.0 - drop_prob)
		inputs /= (1.0 - K.sigmoid(self.p_logit))

		# Now just carry out the basic operations of a Dense layer.
		return super(SpatialConcreteDropout, self).call(inputs)

[docs]	def compute_output_shape(self, input_shape):
		"""
		Compute the shape of the output given the input. Needed for Keras
		layer.

		Parameters:
			input_shape ((int,...)): The shape of the input to our Dense layer.

		Returns:
			((int,...)): The output shape of the layer.
		"""
		return super(SpatialConcreteDropout, self).compute_output_shape(
			input_shape)


[docs]def dropout_alexnet(img_size, num_params, kernel_regularizer=1e-6,
	dropout_rate=0.1,random_seed=None):
	"""
	Build the tensorflow graph for the alexnet BNN.

	Parameters:
		img_size ((int,int,int)): A tupe with shape (pix,pix,freq) that describes
			the size of the input images
		num_params (int): The number of lensing parameters to predict
		kernel_regularizer (float): The strength of the l2 norm (associated to
			the strength of the prior on the weights)
		dropout_rate (float): The dropout rate to use for the layers.
		random_seed (int): A seed to use in the random function calls. If None
			no explicit seed will be used.

	Returns:
		(tf.Tensor): The model (i.e. the tensorflow graph for the model)
	"""

	# Initialize model
	inputs = Input(shape=img_size)
	regularizer = tf.keras.regularizers.l2(kernel_regularizer*(1-dropout_rate))

	# Layer 1
	# model.add(AlwaysDropout(dropout_rate))
	if dropout_rate > 0:
		x = AlwaysDropout(dropout_rate)(inputs)
	else:
		x = inputs
	x = Conv2D(filters=64, kernel_size=(5,5), strides=(2,2),
		padding='valid', activation='relu', input_shape=img_size,
		kernel_regularizer=regularizer)(x)
	x = MaxPooling2D(pool_size=(3,3), strides=(2,2), padding='same')(x)

	# Layer 2
	if dropout_rate > 0:
		x = AlwaysDropout(dropout_rate)(x)
	x = Conv2D(filters=192, kernel_size=(5,5), strides=(1,1),
		padding='same', activation='relu',
		kernel_regularizer=regularizer)(x)
	x = MaxPooling2D(pool_size=(3,3), strides=(2,2), padding='same')(x)

	# Layer 3
	if dropout_rate > 0:
		x = AlwaysDropout(dropout_rate)(x)
	x = Conv2D(filters=384, kernel_size=(3,3), strides=(1,1),
		padding='same', activation='relu',
		kernel_regularizer=regularizer)(x)

	# Layer 4
	if dropout_rate > 0:
		x = AlwaysDropout(dropout_rate)(x)
	x = Conv2D(filters=384, kernel_size=(3,3), strides=(1,1),
		padding='same', activation='relu',
		kernel_regularizer=regularizer)(x)

	# Layer 5
	if dropout_rate > 0:
		x = AlwaysDropout(dropout_rate)(x)
	x = Conv2D(filters=256, kernel_size=(3,3), strides=(1,1),
		padding='same', activation='relu',
		kernel_regularizer=regularizer)(x)
	x = MaxPooling2D(pool_size=(3,3), strides=(2,2), padding='same')(x)

	# Pass to fully connected layers
	x = Flatten()(x)

	# Layer 6
	if dropout_rate > 0:
		x = AlwaysDropout(dropout_rate)(x)
	x = Dense(4096, activation='relu',
		kernel_regularizer=regularizer)(x)

	# Layer 7
	if dropout_rate > 0:
		x = AlwaysDropout(dropout_rate)(x)
	x = Dense(4096, activation='relu',
		kernel_regularizer=regularizer)(x)

	# Output
	if dropout_rate > 0:
		x = AlwaysDropout(dropout_rate)(x)
	outputs = Dense(num_params,
		kernel_regularizer=regularizer)(x)

	# Construct model
	model = Model(inputs=inputs, outputs=outputs)

	return model


[docs]def concrete_alexnet(img_size, num_params, kernel_regularizer=1e-6,
	dropout_regularizer=1e-5, init_min=0.1, init_max=0.1,
	temp=0.1, random_seed=None):
	"""
	Build the tensorflow graph for the concrete dropout alexnet BNN.

	Parameters:
		img_size ((int,int,int)): A tupe with shape (pix,pix,freq) that describes
			the size of the input images
		num_params (int): The number of lensing parameters to predict
		kernel_regularizer (float): The strength of the l2 norm (associated to
			the strength of the prior on the weights)
		dropout_regularizer (float): The stronger it is, the more concrete
			dropout will tend towards larger dropout rates.
		init_min (float): The minimum value that the dropout weight p will
			be initialized to.
		init_max (float): The maximum value that the dropout weight p will
			be initialized to.
		temp (float): The temperature that defines how close the concrete
			distribution will be to true dropout.
		random_seed (int): A seed to use in the random function calls. If None
			no explicit seed will be used.

	Returns:
		(tf.Tensor): The model (i.e. the tensorflow graph for the model)

	Notes:
		While the concrete dropout implementation works, the training of the
		dropout terms is very slow. It's possible that modifying the learning
		rate schedule may help.
	"""

	# Initialize model
	inputs = Input(shape=img_size)

	# Layer 1
	# model.add(AlwaysDropout(dropout_rate))
	x = SpatialConcreteDropout(filters=64, kernel_size=(5,5), strides=(2,2),
		padding='valid', activation='relu', input_shape=img_size,
		kernel_regularizer=kernel_regularizer,
		dropout_regularizer=dropout_regularizer,
		init_min=init_min, init_max=init_max, temp=temp,
		random_seed=random_seed)(inputs)
	x = MaxPooling2D(pool_size=(3,3), strides=(2,2), padding='same')(x)

	# Layer 2
	x = SpatialConcreteDropout(filters=192, kernel_size=(5,5), strides=(1,1),
		padding='same', activation='relu',
		kernel_regularizer=kernel_regularizer,
		dropout_regularizer=dropout_regularizer,
		init_min=init_min, init_max=init_max, temp=temp,
		random_seed=random_seed)(x)
	x = MaxPooling2D(pool_size=(3,3), strides=(2,2), padding='same')(x)

	# Layer 3
	x = SpatialConcreteDropout(filters=384, kernel_size=(3,3), strides=(1,1),
		padding='same', activation='relu',
		kernel_regularizer=kernel_regularizer,
		dropout_regularizer=dropout_regularizer,
		init_min=init_min, init_max=init_max, temp=temp,
		random_seed=random_seed)(x)

	# Layer 4
	x = SpatialConcreteDropout(filters=384, kernel_size=(3,3), strides=(1,1),
		padding='same', activation='relu',
		kernel_regularizer=kernel_regularizer,
		dropout_regularizer=dropout_regularizer,
		init_min=init_min, init_max=init_max, temp=temp,
		random_seed=random_seed)(x)

	# Layer 5
	x = SpatialConcreteDropout(filters=256, kernel_size=(3,3), strides=(1,1),
		padding='same', activation='relu',
		kernel_regularizer=kernel_regularizer,
		dropout_regularizer=dropout_regularizer,
		init_min=init_min, init_max=init_max, temp=temp,
		random_seed=random_seed)(x)
	x = MaxPooling2D(pool_size=(3,3), strides=(2,2), padding='same')(x)

	# Pass to fully connected layers
	x = Flatten()(x)

	# Layer 6
	x = ConcreteDropout(4096, activation='relu',
		kernel_regularizer=kernel_regularizer,
		dropout_regularizer=dropout_regularizer, init_min=init_min,
		init_max=init_max, temp=temp, random_seed=random_seed)(x)

	# Layer 7
	x = ConcreteDropout(4096, activation='relu',
		kernel_regularizer=kernel_regularizer,
		dropout_regularizer=dropout_regularizer, init_min=init_min,
		init_max=init_max, temp=temp, random_seed=random_seed)(x)

	# Output
	outputs = ConcreteDropout(num_params,
		kernel_regularizer=kernel_regularizer,
		dropout_regularizer=dropout_regularizer, init_min=init_min,
		init_max=init_max, temp=temp, random_seed=random_seed)(x)

	# Construct model
	model = Model(inputs=inputs, outputs=outputs)

	return model


[docs]class LensingLossFunctions:
	"""
	A class used to generate the loss functions for the three types of bayesian
	nn models we have implemented: diagonal covariance, full covariance,
	and mixture of full covariances. Currently only two gaussians are allowed
	in the mixture.
	"""
	def __init__(self,flip_pairs,num_params):
		"""
		Initialize the class with the pairs of parameters that must be flipped.
		These are parameters like shear and ellipticity that have been defined
		such that negating both parameters gives the same
		physical definition of the system.

		Parameters:
			flip_pairs ([[int,int,...],...]): A list of pairs of numbers to
				conduct the flip operation on. If empty no flip pairs will be
				used. Note if you also want to consider two sets of parameters
				being flipped at the same time, that must be added to this list.
			num_params (int): The number of parameters to predict.
		"""

		self.flip_pairs = flip_pairs
		self.num_params = num_params

		# Calculate the split list for lower traingular matrix
		self.split_list = []
		for i in range(1,num_params+1):
			self.split_list += [i]

		# Now for each flip pair (including no flip) we will add a flip
		# matrix to our list.
		self.flip_mat_list = [tf.linalg.diag(tf.constant(np.ones(
			self.num_params),dtype=tf.float32))]
		for flip_pair in self.flip_pairs:
			# Initialize a numpy array since this is the easiest way
			# to flexibly set the tensor.
			const_initializer = np.ones(self.num_params)
			const_initializer[flip_pair] = -1
			self.flip_mat_list.append(tf.linalg.diag(tf.constant(
				const_initializer,dtype=tf.float32)))

[docs]	def mse_loss(self, y_true, output):
		"""
		Returns the MSE loss of the predicted parameters. Will ignore parameters
		associated with the covariance matrix.

		Parameters:
			y_true (tf.Tensor): The true values of the parameters
			output (tf.Tensor): The predicted values of the lensing parameters.
				This assumes the first num_params are

		Returns:
			(tf.Tensor): The mse loss function.

		Notes:
			This function should never be used as a loss function. It is useful
			as a metric to understand what portion of the reduciton in the loss
			function can be attributed to improved parameter accuracy. Also
			note that for the gmm models the output will default to the first
			Gaussian for this metric.
		"""
		y_pred, _ = tf.split(output,num_or_size_splits=(self.num_params,-1),
			axis=-1)
		loss_list = []
		for flip_mat in self.flip_mat_list:
			loss_list.append(tf.reduce_mean(tf.square(
				tf.matmul(y_pred,flip_mat)-y_true),axis=-1))
		loss_stack = tf.stack(loss_list,axis=-1)
		return tf.reduce_min(loss_stack,axis=-1)

[docs]	def log_gauss_diag(self,y_true,y_pred,std_pred):
		"""
		Return the negative log posterior of a Gaussian with diagonal
		covariance matrix

		Parameters:
			y_true (tf.Tensor): The true values of the parameters
			y_pred (tf.Tensor): The predicted value of the parameters
			std_pred (tf.Tensor): The predicted diagonal entries of the
				covariance. Note that std_pred is assumed to be the log of the
				covariance matrix values.

		Returns:
			(tf.Tensor): The TF graph for calculating the nlp

		Notes:
			This loss does not include the constant factor of 1/(2*pi)^(d/2).
		"""
		return 0.5*tf.reduce_sum(tf.multiply(tf.square(y_pred-y_true),
			tf.exp(-std_pred)),axis=-1) + 0.5*tf.reduce_sum(
			std_pred,axis=-1)

[docs]	def diagonal_covariance_loss(self,y_true,output):
		"""
		Return the loss function assuming a diagonal covariance matrix

		Parameters:
			y_true (tf.Tensor): The true values of the lensing parameters
			output (tf.Tensor): The predicted values of the lensing parameters.
				This should include 2*self.num_params parameters to account for
				the diagonal entries of our covariance matrix. Covariance matrix
				values are assumed to be in log space.

		Returns:
			(tf.Tensor): The loss function (i.e. the tensorflow graph for it).
		"""
		# First split the data into predicted parameters and covariance matrix
		# element
		y_pred, std_pred = tf.split(output,num_or_size_splits=2,axis=-1)

		# Add each possible flip to the loss list. We will then take the
		# minimum.
		loss_list = []
		for flip_mat in self.flip_mat_list:
			loss_list.append(self.log_gauss_diag(y_true,
				tf.matmul(y_pred,flip_mat),std_pred))
		loss_stack = tf.stack(loss_list,axis=-1)
		return tf.reduce_min(loss_stack,axis=-1)

[docs]	def construct_precision_matrix(self,L_mat_elements):
		"""
		Take the matrix elements for the log cholesky decomposition and
		convert them to the precision matrix. Also return the value of
		the diagonal elements before exponentiation, since we get that for
		free.

		Parameters:
			L_mat_elements (tf.Tensor): A tensor of length
				num_params*(num_params+1)/2 that define the lower traingular
				matrix elements of the log cholesky decomposition

		Returns:
			((tf.Tensor,tf.Tensor)): Both the precision matrix and the diagonal
			elements (before exponentiation) of the log cholesky L matrix.
			Note that this second value is important for the posterior
			calculation.
		"""
		# First split the tensor into the elements that will populate each row
		cov_elements_split = tf.split(L_mat_elements,
			num_or_size_splits=self.split_list,axis=-1)
		# Before we stack these elements, we have to pad them with zeros
		# (corresponding to the 0s of the lower traingular matrix).
		cov_elements_stack = []
		pad_offset = 1
		for cov_element in cov_elements_split:
			# Use tf pad function since it's likely the fastest option.
			pad = tf.constant([[0,0],[0,self.num_params-pad_offset]])
			cov_elements_stack.append(tf.pad(cov_element,pad))
			pad_offset+=1
		# Stack the tensors to form our matrix. Use axis=-2 to avoid issues
		# with batches of matrices being passed in.
		L_mat = tf.stack(cov_elements_stack,axis=-2)
		# Pull out the diagonal part, and then (since we're using log
		# cholesky) exponentiate the diagonal.
		L_mat_diag = tf.linalg.diag_part(L_mat)
		L_mat = tf.linalg.set_diag(L_mat,tf.exp(L_mat_diag))
		# Calculate the actual precision matrix
		prec_mat = tf.matmul(L_mat,tf.transpose(L_mat,perm=[0,2,1]))

		return prec_mat, L_mat_diag, L_mat

[docs]	def log_gauss_full(self,y_true,y_pred,prec_mat,L_diag):
		"""
		Return the negative log posterior of a Gaussian with full
		covariance matrix

		Parameters:
			y_true (tf.Tensor): The true values of the parameters
			y_pred (tf.Tensor): The predicted value of the parameters
			prec_mat: The precision matrix
			L_diag (tf.Tensor): The diagonal (non exponentiated) values of the
				log cholesky decomposition of the precision matrix

		Returns:
			(tf.Tensor): The TF graph for calculating the nlp

		Notes:
			This loss does not include the constant factor of 1/(2*pi)^(d/2).
		"""
		y_dif = y_true - y_pred
		return -tf.reduce_sum(L_diag,-1) + 0.5 * tf.reduce_sum(
			tf.multiply(y_dif,tf.reduce_sum(tf.multiply(tf.expand_dims(
				y_dif,-1),prec_mat),axis=-2)),-1)

[docs]	def full_covariance_loss(self,y_true,output):
		"""
		Return the loss function assuming a full covariance matrix

		Parameters:
			y_true (tf.Tensor): The true values of the lensing parameters
			output (tf.Tensor): The predicted values of the lensing parameters.
				This should include self.num_params parameters for the prediction
				and self.num_params*(self.num_params+1)/2 parameters for the
				lower triangular log cholesky decomposition

		Returns:
			(tf.Tensor): The loss function (i.e. the tensorflow graph for it).
		"""
		# Start by dividing the output into the L_elements and the prediction
		# values.
		L_elements_len = int(self.num_params*(self.num_params+1)/2)
		y_pred, L_mat_elements = tf.split(output,
			num_or_size_splits=[self.num_params,L_elements_len],axis=-1)

		# Build the precision matrix and extract the diagonal part
		prec_mat, L_diag, _ = self.construct_precision_matrix(L_mat_elements)

		# Add each possible flip to the loss list. We will then take the
		# minimum.
		loss_list = []
		for flip_mat in self.flip_mat_list:
			loss_list.append(self.log_gauss_full(y_true,
				tf.matmul(y_pred,flip_mat),prec_mat,L_diag))
		loss_stack = tf.stack(loss_list,axis=-1)
		return tf.reduce_min(loss_stack,axis=-1)

[docs]	def log_gauss_gm_full(self,y_true,y_preds,prec_mats,L_diags,pis):
		"""
		Return the negative log posterior of a GMM with full
		covariance matrix for each GM. Note this code allows for any number
		of GMMs.

		Parameters:
			y_true (tf.Tensor): The true values of the parameters
			y_preds ([tf.Tensor,...]): A list of the predicted value of the
				parameters
			prec_mats ([tf.Tensor,...]): A list of the precision matrices
			L_diags ([tf.Tensor,...]): A list of the diagonal (non exponentiated)
				values of the log cholesky decomposition of the precision
				matrices

		Returns:
			(tf.Tensor): The TF graph for calculating the nlp

		Notes:
			This loss does not include the constant factors of 1/(2*pi)^(d/2).
		"""
		# Stack together the loss to be able to do the logsumexp trick
		loss_list = []
		for p_i in range(len(y_preds)):
			# Since we're summing the probabilities using a logsumexp,
			# we don't want the negative here. Also note that we add an
			# epsilon to our log operation to avoid nan gradients.
			loss_list.append(-self.log_gauss_full(y_true,y_preds[p_i],
				prec_mats[p_i],L_diags[p_i])+tf.squeeze(tf.math.log(
					pis[p_i]+K.epsilon()),axis=-1))

		# Use tf implementation of logsumexp
		return -tf.reduce_logsumexp(tf.stack(loss_list,axis=-1),axis=-1)

[docs]	def gm_full_covariance_loss(self,y_true,output):
		"""
		Return the loss function assuming a mixture of two gaussians each with
		a full covariance matrix

		Parameters:
			y_true (tf.Tensor): The true values of the lensing parameters
			output (tf.Tensor): The predicted values of the lensing parameters.
				This should include 2 gm which consists of self.num_params
				parameters for the prediction and
				self.num_params*(self.num_params+1)/2 parameters for the
				lower triangular log cholesky decomposition of each gm.
				It should also include one final parameter for the ratio
				between the two gms.

		Returns:
			(tf.Tensor): The loss function (i.e. the tensorflow graph for it).
		"""
		# Start by seperating out the predictions for each gaussian model.
		L_elements_len = int(self.num_params*(self.num_params+1)/2)
		y_pred1, L_mat_elements1, y_pred2, L_mat_elements2, pi_logit = tf.split(
			output,num_or_size_splits=[self.num_params,L_elements_len,
			self.num_params,L_elements_len,1],axis=-1)

		# Set the probability between 0.5 and 1.0. In this parameterization the
		# first Gaussian is always favored.
		pi = 0.5+tf.sigmoid(pi_logit)/2.0

		# Now build the precision matrix for our two models and extract the
		# diagonal components used for the loss calculation
		prec_mat1, L_diag1, _ = self.construct_precision_matrix(L_mat_elements1)
		prec_mat2, L_diag2, _ = self.construct_precision_matrix(L_mat_elements2)

		# Add each possible flip to the loss list. We will then take the
		# minimum.
		loss_list = []
		prec_mats = [prec_mat1,prec_mat2]
		L_diags = [L_diag1,L_diag2]
		pis = [pi,1-pi]
		for flip_mat1 in self.flip_mat_list:
			for flip_mat2 in self.flip_mat_list:
				# The y_preds depends on the selected flips
				y_preds = [tf.matmul(y_pred1,flip_mat1),
					tf.matmul(y_pred2,flip_mat2)]
				loss_list.append(self.log_gauss_gm_full(y_true,y_preds,
					prec_mats,L_diags,pis))
		loss_stack = tf.stack(loss_list,axis=-1)
		return tf.reduce_min(loss_stack,axis=-1)


[docs]def p_value(model):
	"""
	Returns the average value of the dropout in each concrete layer.

	Parameters:
		model (keras.Model): A Keras model from with the dropout values will be
			extracted.

	Notes:
		This is a hack that allows us to easily keep track of the dropout value
		during training.
	"""
	def p_fake_loss(y_true,y_pred):
		# We won't be using either y_true or y_pred
		loss = []
		for layer in model.layers:
			if 'dropout' in layer.name:
				loss.append(tf.sigmoid(layer.weights[2]))
		return tf.reduce_mean(loss)

	return p_fake_loss