diff --git a/CHANGELOG.md b/CHANGELOG.md
index 9878bb1b3..d7249a6ba 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,7 @@
 # Changelog
 
 ## Unreleased
+* fix invalid escape sequence #1011 [janmayer]
 
 ## v1.13.2 (2024-07-21)
 * update string checks in initialization method for latent variable and put `empirical_samples` init-method on a deprecation path
diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 3cfa38ec3..95a4c2200 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -194,7 +194,7 @@ def save_model(self, output_filename, compress=True, save_data=True):
     # Make sure to name this variable and the predict functions will "just work"
     # In maths the predictive variable is:
     #         K_{xx} - K_{xp}W_{pp}^{-1}K_{px}
-    #         W_{pp} := \texttt{Woodbury inv}
+    #         W_{pp} := \texttt{Woodbury inv}
     #         p := _predictive_variable
 
     @property
@@ -283,7 +283,7 @@ def parameters_changed(self):
 
     def log_likelihood(self):
         """
-        The log marginal likelihood of the model, :math:`p(\mathbf{y})`, this is the objective function of the model being optimised
+        The log marginal likelihood of the model, :math:`p(\\mathbf{y})`, this is the objective function of the model being optimised
         """
         return self._log_marginal_likelihood
 
@@ -296,9 +296,9 @@ def _raw_predict(self, Xnew, full_cov=False, kern=None):
         diagonal of the covariance is returned.
 
         .. math::
-            p(f*|X*, X, Y) = \int^{\inf}_{\inf} p(f*|f,X*)p(f|X,Y) df
-                        = N(f*| K_{x*x}(K_{xx} + \Sigma)^{-1}Y, K_{x*x*} - K_{xx*}(K_{xx} + \Sigma)^{-1}K_{xx*}
-            \Sigma := \texttt{Likelihood.variance / Approximate likelihood covariance}
+            p(f*|X*, X, Y) = \\int^{\\infty}_{-\\infty} p(f*|f,X*)p(f|X,Y) df
+                        = N(f*| K_{x*x}(K_{xx} + \\Sigma)^{-1}Y, K_{x*x*} - K_{x*x}(K_{xx} + \\Sigma)^{-1}K_{xx*})
+            \\Sigma := \\texttt{Likelihood.variance / Approximate likelihood covariance}
         """
         mu, var = self.posterior._raw_predict(kern=self.kern if kern is None else kern, Xnew=Xnew, pred_var=self._predictive_variable, full_cov=full_cov)
         if self.mean_function is not None:
@@ -702,7 +702,7 @@ def log_predictive_density(self, x_test, y_test, Y_metadata=None):
         Calculation of the log predictive density
 
-        .. math:
-            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+        .. math::
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\\mu_{*}\\sigma^{2}_{*})
 
         :param x_test: test locations (x_{*})
         :type x_test: (Nx1) array
@@ -718,7 +718,7 @@ def log_predictive_density_sampling(self, x_test, y_test, Y_metadata=None, num_s
         Calculation of the log predictive density by sampling
 
-        .. math:
-            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+        .. math::
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\\mu_{*}\\sigma^{2}_{*})
 
         :param x_test: test locations (x_{*})
         :type x_test: (Nx1) array
@@ -734,24 +734,24 @@ def log_predictive_density_sampling(self, x_test, y_test, Y_metadata=None, num_s
 
     def _raw_posterior_covariance_between_points(self, X1, X2):
         """
-        Computes the posterior covariance between points. Does not account for 
+        Computes the posterior covariance between points. Does not account for
         normalization or likelihood
 
         :param X1: some input observations
         :param X2: other input observations
 
-        :returns: 
+        :returns:
             cov: raw posterior covariance: k(X1,X2) - k(X1,X) G^{-1} K(X,X2)
         """
         return self.posterior.covariance_between_points(self.kern, self.X, X1, X2)
 
 
-    def posterior_covariance_between_points(self, X1, X2, Y_metadata=None, 
-                                            likelihood=None, 
+    def posterior_covariance_between_points(self, X1, X2, Y_metadata=None,
+                                            likelihood=None,
                                             include_likelihood=True):
         """
-        Computes the posterior covariance between points. Includes likelihood 
-        variance as well as normalization so that evaluation at (x,x) is consistent 
+        Computes the posterior covariance between points. Includes likelihood
+        variance as well as normalization so that evaluation at (x,x) is consistent
         with model.predict
 
         :param X1: some input observations
@@ -762,8 +762,8 @@ def posterior_covariance_between_points(self, X1, X2, Y_metadata=None,
                                    the predicted underlying latent function f.
         :type include_likelihood: bool
 
-        :returns: 
-            cov: posterior covariance, a Numpy array, Nnew x Nnew if 
+        :returns:
+            cov: posterior covariance, a Numpy array, Nnew x Nnew if
             self.output_dim == 1, and Nnew x Nnew x self.output_dim otherwise.
         """
 
@@ -774,7 +774,7 @@ def posterior_covariance_between_points(self, X1, X2, Y_metadata=None,
             mean, _ = self._raw_predict(X1, full_cov=True)
             if likelihood is None:
                 likelihood = self.likelihood
-            _, cov = likelihood.predictive_values(mean, cov, full_cov=True, 
+            _, cov = likelihood.predictive_values(mean, cov, full_cov=True,
                                                   Y_metadata=Y_metadata)
 
         if self.normalizer is not None:
diff --git a/GPy/core/symbolic.py b/GPy/core/symbolic.py
index c4261e243..c11312e62 100644
--- a/GPy/core/symbolic.py
+++ b/GPy/core/symbolic.py
@@ -44,7 +44,7 @@ def __init__(self, expressions, cacheable, derivatives=None, parameters=None, fu
         self._set_derivatives(derivatives)
         self._set_parameters(parameters)
         # Convert the expressions to a list for common sub expression elimination
-        # We should find the following type of expressions: 'function', 'derivative', 'second_derivative', 'third_derivative'. 
+        # We should find the following type of expressions: 'function', 'derivative', 'second_derivative', 'third_derivative'.
         self.update_expression_list()
 
         # Apply any global stabilisation operations to expressions.
@@ -86,7 +86,7 @@ def extract_vars(expr):
         # object except as cached. For covariance functions this is X
         # and Z, for likelihoods F and for mapping functions X.
         self.cacheable_vars = [] # list of everything that's cacheable
-        for var in cacheable:            
+        for var in cacheable:
             self.variables[var] = [e for e in vars if e.name.split('_')[0]==var.lower()]
             self.cacheable_vars += self.variables[var]
         for var in cacheable:
@@ -105,7 +105,7 @@ def extract_derivative(function, derivative_arguments):
             for derivative in derivatives:
                 derivative_arguments += self.variables[derivative]
 
-            # Do symbolic work to compute derivatives.        
+            # Do symbolic work to compute derivatives.
             for key, func in self.expressions.items():
                 # if func['function'].is_Matrix:
                 #     rows = func['function'].shape[0]
@@ -126,7 +126,7 @@ def _set_parameters(self, parameters):
                 if theta.name in parameters:
                     val = parameters[theta.name]
             # Add parameter.
-            
+
             self.link_parameters(Param(theta.name, val, None))
             #self._set_attribute(theta.name, )
 
@@ -174,7 +174,7 @@ def eval_update_gradients(self, function, partial, **kwargs):
             code = self.code[function]['derivative'][theta.name]
             gradient[theta.name] = (partial*eval(code, self.namespace)).sum()
         return gradient
-        
+
     def eval_gradients_X(self, function, partial, **kwargs):
         if 'X' in kwargs:
             gradients_X = np.zeros_like(kwargs['X'])
@@ -194,7 +194,7 @@ def code_parameters_changed(self):
         for variable, code in self.variable_sort(self.code['parameters_changed']):
             lcode += self._print_code(variable) + ' = ' + self._print_code(code) + '\n'
         return lcode
-    
+
     def code_update_cache(self):
         lcode = ''
         for var in self.cacheable:
@@ -208,7 +208,7 @@ def code_update_cache(self):
             for i, theta in enumerate(self.variables[var]):
                 lcode+= "\t" + var + '= np.atleast_2d(' + var + ')\n'
                 lcode+= "\t" + self._print_code(theta.name) + ' = ' + var + '[:, ' + str(i) + "]" + reorder + "\n"
-    
+
         for variable, code in self.variable_sort(self.code['update_cache']):
             lcode+= self._print_code(variable) + ' = ' + self._print_code(code) + "\n"
 
@@ -250,7 +250,7 @@ def _set_attribute(self, name, value):
         """Make sure namespace gets updated when setting attributes."""
         setattr(self, name, value)
         self.namespace.update({name: getattr(self, name)})
-        
+
 
     def update_expression_list(self):
         """Extract a list of expressions from the dictionary of expressions."""
@@ -260,9 +260,9 @@ def update_expression_list(self):
         for fname, fexpressions in self.expressions.items():
             for type, texpressions in fexpressions.items():
                 if type == 'function':
-                    self.expression_list.append(texpressions)            
+                    self.expression_list.append(texpressions)
                     self.expression_keys.append([fname, type])
-                    self.expression_order.append(1) 
+                    self.expression_order.append(1)
                 elif type[-10:] == 'derivative':
                     for dtype, expression in texpressions.items():
                         self.expression_list.append(expression)
@@ -274,9 +274,9 @@ def update_expression_list(self):
                         elif type[:-10] == 'third_':
                             self.expression_order.append(5) #sym.count_ops(self.expressions[type][dtype]))
                 else:
-                    self.expression_list.append(fexpressions[type])            
+                    self.expression_list.append(fexpressions[type])
                     self.expression_keys.append([fname, type])
-                    self.expression_order.append(2) 
+                    self.expression_order.append(2)
 
         # This step may be unecessary.
         # Not 100% sure if the sub expression elimination is order sensitive. This step orders the list with the 'function' code first and derivatives after.
@@ -313,7 +313,7 @@ def extract_sub_expressions(self, cache_prefix='cache', sub_prefix='sub', prefix
             sym_var = sym.var(cache_prefix + str(i))
             self.variables[cache_prefix].append(sym_var)
             replace_dict[expr.name] = sym_var
-            
+
         for i, expr in enumerate(params_change_list):
             sym_var = sym.var(sub_prefix + str(i))
             self.variables[sub_prefix].append(sym_var)
@@ -329,7 +329,7 @@ def extract_sub_expressions(self, cache_prefix='cache', sub_prefix='sub', prefix
         for keys in self.expression_keys:
             for replace, void in common_sub_expressions:
                 setInDict(self.expressions, keys, getFromDict(self.expressions, keys).subs(replace, replace_dict[replace.name]))
-        
+
         self.expressions['parameters_changed'] = {}
         self.expressions['update_cache'] = {}
         for var, expr in common_sub_expressions:
@@ -339,7 +339,7 @@ def extract_sub_expressions(self, cache_prefix='cache', sub_prefix='sub', prefix
                 self.expressions['update_cache'][replace_dict[var.name].name] = expr
             else:
                 self.expressions['parameters_changed'][replace_dict[var.name].name] = expr
-            
+
 
     def _gen_code(self):
         """Generate code for the list of expressions provided using the common sub-expression eliminator to separate out portions that are computed multiple times."""
@@ -357,8 +357,8 @@ def match_key(expr):
             return code
 
         self.code = match_key(self.expressions)
-                            
- 
+
+
     def _expr2code(self, arg_list, expr):
         """Convert the given symbolic expression into code."""
         code = lambdastr(arg_list, expr)
@@ -379,7 +379,7 @@ def _print_code(self, code):
     def _display_expression(self, keys, user_substitutes={}):
         """Helper function for human friendly display of the symbolic components."""
         # Create some pretty maths symbols for the display.
-        sigma, alpha, nu, omega, l, variance = sym.var('\sigma, \alpha, \nu, \omega, \ell, \sigma^2')
+        sigma, alpha, nu, omega, l, variance = sym.var(r'\sigma, \alpha, \nu, \omega, \ell, \sigma^2')
         substitutes = {'scale': sigma, 'shape': alpha, 'lengthscale': l, 'variance': variance}
         substitutes.update(user_substitutes)
 
@@ -416,5 +416,5 @@ def sort_key(x):
                 return int(digits[0])
             else:
                 return x[0]
-            
+
         return sorted(var_dict.items(), key=sort_key, reverse=reverse)
diff --git a/GPy/inference/latent_function_inference/expectation_propagation.py b/GPy/inference/latent_function_inference/expectation_propagation.py
index 2a558b5b5..df27d7db2 100644
--- a/GPy/inference/latent_function_inference/expectation_propagation.py
+++ b/GPy/inference/latent_function_inference/expectation_propagation.py
@@ -134,10 +134,10 @@ def _recompute(mean_prior, K, ga_approx):
         B = np.eye(num_data) + Sroot_tilde_K * tau_tilde_root[None,:]
         L = jitchol(B)
         V, _ = dtrtrs(L, Sroot_tilde_K, lower=1)
-        Sigma = K - np.dot(V.T,V) #K - KS^(1/2)BS^(1/2)K = (K^(-1) + \Sigma^(-1))^(-1)
+        Sigma = K - np.dot(V.T,V) #K - KS^(1/2)B^(-1)S^(1/2)K = (K^(-1) + \Sigma^(-1))^(-1)
 
         aux_alpha , _ = dpotrs(L, tau_tilde_root * (np.dot(K, ga_approx.v) + mean_prior), lower=1)
-        alpha = ga_approx.v - tau_tilde_root * aux_alpha #(K + Sigma^(\tilde))^(-1) (/mu^(/tilde) - /mu_p)
+        alpha = ga_approx.v - tau_tilde_root * aux_alpha #(K + Sigma^(\tilde))^(-1) (\mu^(\tilde) - \mu_p)
         mu = np.dot(K, alpha) + mean_prior
 
         return posteriorParams(mu=mu, Sigma=Sigma, L=L)
@@ -151,8 +151,8 @@ def _update_rank1(self, LLT, Kmn, delta_v, delta_tau, i):
         DSYR(LLT,Kmn[:,i].copy(),delta_tau)
         L = jitchol(LLT)
         V,info = dtrtrs(L,Kmn,lower=1)
-        self.Sigma_diag = np.maximum(np.sum(V*V,-2), np.finfo(float).eps)  #diag(K_nm (L L^\top)^(-1)) K_mn
-        si = np.sum(V.T*V[:,i],-1) #(V V^\top)[:,i]
+        self.Sigma_diag = np.maximum(np.sum(V*V,-2), np.finfo(float).eps)  #diag(K_nm (L L^\top)^(-1) K_mn)
+        si = np.sum(V.T*V[:,i],-1) #(V^\top V)[:,i]
         self.mu += (delta_v-delta_tau*self.mu[i])*si
         #mu = np.dot(Sigma, v_tilde)
 
@@ -391,11 +391,11 @@ def _inference(self, Y, mean_prior, K, ga_approx, cav_params, likelihood, Z_tild
 
 
         aux_alpha , _ = dpotrs(post_params.L, tau_tilde_root * (np.dot(K, ga_approx.v) +  mean_prior), lower=1)
-        alpha = (ga_approx.v - tau_tilde_root * aux_alpha)[:,None] #(K + Sigma^(\tilde))^(-1) (/mu^(/tilde) -  /mu_p)
+        alpha = (ga_approx.v - tau_tilde_root * aux_alpha)[:,None] #(K + Sigma^(\tilde))^(-1) (\mu^(\tilde) - \mu_p)
 
         LWi, _ = dtrtrs(post_params.L, np.diag(tau_tilde_root), lower=1)
         Wi = np.dot(LWi.T,LWi)
-        symmetrify(Wi) #(K + Sigma^(\tilde))^(-1)
+        symmetrify(Wi) #(K + Sigma^(\tilde))^(-1)
 
         dL_dK = 0.5 * (tdot(alpha) - Wi)
         dL_dthetaL = likelihood.ep_gradients(Y, cav_params.tau, cav_params.v, np.diag(dL_dK), Y_metadata=Y_metadata, quad_mode='gh')
@@ -530,7 +530,7 @@ def _init_approximations(self, Kmm, Kmn, num_data):
         #initial values - Gaussian factors
         #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
         LLT0 = Kmm.copy()
-        Lm = jitchol(LLT0) #K_m = L_m L_m^\top
+        Lm = jitchol(LLT0) #K_mm = L_m L_m^\top
         Vm,info = dtrtrs(Lm, Kmn,lower=1)
         # Lmi = dtrtri(Lm)
         # Kmmi = np.dot(Lmi.T,Lmi)
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index a77c7631a..e943077b0 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -27,7 +27,7 @@ def __init__(self):
         """
         Laplace Approximation
 
-        Find the moments \hat{f} and the hessian at this point
+        Find the moments \\hat{f} and the hessian at this point
         (using Newton-Raphson) of the unnormalised posterior
 
         """
diff --git a/GPy/inference/latent_function_inference/pep.py b/GPy/inference/latent_function_inference/pep.py
index 79706292e..cd4ac0cc7 100644
--- a/GPy/inference/latent_function_inference/pep.py
+++ b/GPy/inference/latent_function_inference/pep.py
@@ -8,14 +8,14 @@
 class PEP(LatentFunctionInference):
     '''
     Sparse Gaussian processes using Power-Expectation Propagation
-    for regression: alpha \approx 0 gives VarDTC and alpha = 1 gives FITC
-    
-    Reference: A Unifying Framework for Sparse Gaussian Process Approximation using 
+    for regression: alpha \\approx 0 gives VarDTC and alpha = 1 gives FITC
+
+    Reference: A Unifying Framework for Sparse Gaussian Process Approximation using
     Power Expectation Propagation, https://arxiv.org/abs/1605.07066
-    
+
     '''
     const_jitter = 1e-6
-    
+
     def __init__(self, alpha):
         super(PEP, self).__init__()
         self.alpha = alpha
@@ -69,7 +69,7 @@ def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=No
         #compute dL_dR
         Uv = np.dot(U, v)
         dL_dR = 0.5*(np.sum(U*np.dot(U,P), 1) - (1.0+alpha_const_term)/beta_star + np.sum(np.square(Y), 1) - 2.*np.sum(Uv*Y, 1) \
-            + np.sum(np.square(Uv), 1))*beta_star**2 
+            + np.sum(np.square(Uv), 1))*beta_star**2
 
         # Compute dL_dKmm
         vvT_P = tdot(v.reshape(-1,1)) + P
diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py
index 47f8c6158..e0de2aaa0 100644
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@@ -82,7 +82,7 @@ def mean(self):
         Posterior mean
         $$
         K_{xx}v
-        v := \texttt{Woodbury vector}
+        v := \\texttt{Woodbury vector}
         $$
         """
         if self._mean is None:
@@ -95,7 +95,7 @@ def covariance(self):
         Posterior covariance
         $$
         K_{xx} - K_{xx}W_{xx}^{-1}K_{xx}
-        W_{xx} := \texttt{Woodbury inv}
+        W_{xx} := \\texttt{Woodbury inv}
         $$
         """
         if self._covariance is None:
@@ -146,8 +146,8 @@ def woodbury_chol(self):
         """
         return $L_{W}$ where L is the lower triangular Cholesky decomposition of the Woodbury matrix
         $$
-        L_{W}L_{W}^{\top} = W^{-1}
-        W^{-1} := \texttt{Woodbury inv}
+        L_{W}L_{W}^{\\top} = W^{-1}
+        W^{-1} := \\texttt{Woodbury inv}
         $$
         """
         if self._woodbury_chol is None:
@@ -178,8 +178,8 @@ def woodbury_inv(self):
         """
         The inverse of the woodbury matrix, in the gaussian likelihood case it is defined as
         $$
-        (K_{xx} + \Sigma_{xx})^{-1}
-        \Sigma_{xx} := \texttt{Likelihood.variance / Approximate likelihood covariance}
+        (K_{xx} + \\Sigma_{xx})^{-1}
+        \\Sigma_{xx} := \\texttt{Likelihood.variance / Approximate likelihood covariance}
         $$
         """
         if self._woodbury_inv is None:
@@ -200,8 +200,8 @@ def woodbury_vector(self):
         """
         Woodbury vector in the gaussian likelihood case only is defined as
         $$
-        (K_{xx} + \Sigma)^{-1}Y
-        \Sigma := \texttt{Likelihood.variance / Approximate likelihood covariance}
+        (K_{xx} + \\Sigma)^{-1}Y
+        \\Sigma := \\texttt{Likelihood.variance / Approximate likelihood covariance}
         $$
         """
         if self._woodbury_vector is None:
diff --git a/GPy/kern/src/coregionalize.py b/GPy/kern/src/coregionalize.py
index 7f92d4f7e..a7c4f6f70 100644
--- a/GPy/kern/src/coregionalize.py
+++ b/GPy/kern/src/coregionalize.py
@@ -25,12 +25,12 @@ class Coregionalize(Kern):
     This covariance has the form:
 
     .. math::
-       \mathbf{B} = \mathbf{W}\mathbf{W}^\intercal + \mathrm{diag}(kappa)
+       \\mathbf{B} = \\mathbf{W}\\mathbf{W}^\\intercal + \\mathrm{diag}(kappa)
 
     An intrinsic/linear coregionalization covariance function of the form:
 
     .. math::
-       k_2(x, y)=\mathbf{B} k(x, y)
+       k_2(x, y)=\\mathbf{B} k(x, y)
 
     it is obtained as the tensor product between a covariance function
     k(x, y) and B.
diff --git a/GPy/kern/src/eq_ode1.py b/GPy/kern/src/eq_ode1.py
index 4361ec238..caedc7a3a 100644
--- a/GPy/kern/src/eq_ode1.py
+++ b/GPy/kern/src/eq_ode1.py
@@ -15,7 +15,7 @@ class EQ_ODE1(Kern):
 
     This outputs of this kernel have the form
     .. math::
-       \frac{\text{d}y_j}{\text{d}t} = \sum_{i=1}^R w_{j,i} u_i(t-\delta_j) - d_jy_j(t)
+       \\frac{\\text{d}y_j}{\\text{d}t} = \\sum_{i=1}^R w_{j,i} u_i(t-\\delta_j) - d_jy_j(t)
 
     where :math:`R` is the rank of the system, :math:`w_{j,i}` is the sensitivity of the :math:`j`th output to the :math:`i`th latent function, :math:`d_j` is the decay rate of the :math:`j`th output and :math:`u_i(t)` are independent latent Gaussian processes goverened by an exponentiated quadratic covariance.
 
diff --git a/GPy/kern/src/eq_ode2.py b/GPy/kern/src/eq_ode2.py
index 27b15b876..e809b151a 100644
--- a/GPy/kern/src/eq_ode2.py
+++ b/GPy/kern/src/eq_ode2.py
@@ -15,7 +15,7 @@ class EQ_ODE2(Kern):
 
     This outputs of this kernel have the form
     .. math::
-       \frac{\text{d}^2y_j(t)}{\text{d}^2t} + C_j\frac{\text{d}y_j(t)}{\text{d}t} + B_jy_j(t) = \sum_{i=1}^R w_{j,i} u_i(t)
+       \\frac{\\text{d}^2y_j(t)}{\\text{d}^2t} + C_j\\frac{\\text{d}y_j(t)}{\\text{d}t} + B_jy_j(t) = \\sum_{i=1}^R w_{j,i} u_i(t)
 
     where :math:`R` is the rank of the system, :math:`w_{j,i}` is the sensitivity of the :math:`j`th output to the :math:`i`th latent function, :math:`d_j` is the decay rate of the :math:`j`th output and :math:`f_i(t)` and :math:`g_i(t)` are independent latent Gaussian processes goverened by an exponentiated quadratic covariance.
 
diff --git a/GPy/kern/src/grid_kerns.py b/GPy/kern/src/grid_kerns.py
index 9f153c272..5acec5256 100644
--- a/GPy/kern/src/grid_kerns.py
+++ b/GPy/kern/src/grid_kerns.py
@@ -45,7 +45,7 @@ class GridRBF(GridKern):
 
     .. math::
 
-       k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg)
+       k(r) = \\sigma^2 \\exp \\bigg(- \\frac{1}{2} r^2 \\bigg)
 
     """
     _support_GPU = True
diff --git a/GPy/kern/src/kern.py b/GPy/kern/src/kern.py
index b8597c733..92b41e250 100644
--- a/GPy/kern/src/kern.py
+++ b/GPy/kern/src/kern.py
@@ -146,25 +146,25 @@ def Kdiag(self, X):
     def psi0(self, Z, variational_posterior):
         """
         .. math::
-            \psi_0 = \sum_{i=0}^{n}E_{q(X)}[k(X_i, X_i)]
+            \\psi_0 = \\sum_{i=0}^{n}E_{q(X)}[k(X_i, X_i)]
         """
         return self.psicomp.psicomputations(self, Z, variational_posterior)[0]
     def psi1(self, Z, variational_posterior):
         """
         .. math::
-            \psi_1^{n,m} = E_{q(X)}[k(X_n, Z_m)]
+            \\psi_1^{n,m} = E_{q(X)}[k(X_n, Z_m)]
         """
         return self.psicomp.psicomputations(self, Z, variational_posterior)[1]
     def psi2(self, Z, variational_posterior):
         """
         .. math::
-            \psi_2^{m,m'} = \sum_{i=0}^{n}E_{q(X)}[ k(Z_m, X_i) k(X_i, Z_{m'})]
+            \\psi_2^{m,m'} = \\sum_{i=0}^{n}E_{q(X)}[ k(Z_m, X_i) k(X_i, Z_{m'})]
         """
         return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=False)[2]
     def psi2n(self, Z, variational_posterior):
         """
         .. math::
-            \psi_2^{n,m,m'} = E_{q(X)}[ k(Z_m, X_n) k(X_n, Z_{m'})]
+            \\psi_2^{n,m,m'} = E_{q(X)}[ k(Z_m, X_n) k(X_n, Z_{m'})]
 
         Thus, we do not sum out n, compared to psi2
         """
@@ -173,7 +173,7 @@ def gradients_X(self, dL_dK, X, X2):
         """
         .. math::
 
-            \\frac{\partial L}{\partial X} = \\frac{\partial L}{\partial K}\\frac{\partial K}{\partial X}
+            \\frac{\\partial L}{\\partial X} = \\frac{\\partial L}{\\partial K}\\frac{\\partial K}{\\partial X}
         """
         raise NotImplementedError
     def gradients_X_X2(self, dL_dK, X, X2):
@@ -182,7 +182,7 @@ def gradients_XX(self, dL_dK, X, X2, cov=True):
         """
         .. math::
 
-            \\frac{\partial^2 L}{\partial X\partial X_2} = \\frac{\partial L}{\partial K}\\frac{\partial^2 K}{\partial X\partial X_2}
+            \\frac{\\partial^2 L}{\\partial X\\partial X_2} = \\frac{\\partial L}{\\partial K}\\frac{\\partial^2 K}{\\partial X\\partial X_2}
         """
         raise NotImplementedError("This is the second derivative of K wrt X and X2, and not implemented for this kernel")
     def gradients_XX_diag(self, dL_dKdiag, X, cov=True):
@@ -203,7 +203,7 @@ def update_gradients_diag(self, dL_dKdiag, X):
     def update_gradients_full(self, dL_dK, X, X2):
         """Set the gradients of all parameters when doing full (N) inference."""
         raise NotImplementedError
-    
+
     def reset_gradients(self):
         raise NotImplementedError
 
@@ -216,9 +216,9 @@ def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variati
 
         .. math::
 
-            \\frac{\partial L}{\partial \\theta_i} & = \\frac{\partial L}{\partial \psi_0}\\frac{\partial \psi_0}{\partial \\theta_i}\\
-                & \quad + \\frac{\partial L}{\partial \psi_1}\\frac{\partial \psi_1}{\partial \\theta_i}\\
-                & \quad + \\frac{\partial L}{\partial \psi_2}\\frac{\partial \psi_2}{\partial \\theta_i}
+            \\frac{\\partial L}{\\partial \\theta_i} & = \\frac{\\partial L}{\\partial \\psi_0}\\frac{\\partial \\psi_0}{\\partial \\theta_i}\\\\
+                & \\quad + \\frac{\\partial L}{\\partial \\psi_1}\\frac{\\partial \\psi_1}{\\partial \\theta_i}\\\\
+                & \\quad + \\frac{\\partial L}{\\partial \\psi_2}\\frac{\\partial \\psi_2}{\\partial \\theta_i}
 
         Thus, we push the different derivatives through the gradients of the psi
         statistics. Be sure to set the gradients for all kernel
diff --git a/GPy/kern/src/linear.py b/GPy/kern/src/linear.py
index bf75f2010..1caf93f66 100644
--- a/GPy/kern/src/linear.py
+++ b/GPy/kern/src/linear.py
@@ -16,15 +16,15 @@ class Linear(Kern):
 
     .. math::
 
-       k(x,y) = \sum_{i=1}^{\\text{input_dim}} \sigma^2_i x_iy_i
+       k(x,y) = \\sum_{i=1}^{\\text{input_dim}} \\sigma^2_i x_iy_i
 
     :param input_dim: the number of input dimensions
     :type input_dim: int
-    :param variances: the vector of variances :math:`\sigma^2_i`
+    :param variances: the vector of variances :math:`\\sigma^2_i`
     :type variances: array or list of the appropriate size (or float if there
                      is only one variance parameter)
     :param ARD: Auto Relevance Determination. If False, the kernel has only one
-                variance parameter \sigma^2, otherwise there is one variance
+                variance parameter \\sigma^2, otherwise there is one variance
                 parameter per dimension.
     :type ARD: Boolean
     :rtype: kernel object
@@ -121,7 +121,7 @@ def gradients_XX(self, dL_dK, X, X2=None):
         the returned array is of shape [NxNxQxQ].
 
-        ..math:
-            \frac{\partial^2 K}{\partial X2 ^2} = - \frac{\partial^2 K}{\partial X\partial X2}
+        .. math::
+            \\frac{\\partial^2 K}{\\partial X2 ^2} = - \\frac{\\partial^2 K}{\\partial X\\partial X2}
 
         ..returns:
             dL2_dXdX2:  [NxMxQxQ] for X [NxQ] and X2[MxQ] (X2 is X if, X2 is None)
diff --git a/GPy/kern/src/mlp.py b/GPy/kern/src/mlp.py
index dc69f5fd3..2b087df06 100644
--- a/GPy/kern/src/mlp.py
+++ b/GPy/kern/src/mlp.py
@@ -20,12 +20,12 @@ class MLP(Kern):
 
     :param input_dim: the number of input dimensions
     :type input_dim: int
-    :param variance: the variance :math:`\sigma^2`
+    :param variance: the variance :math:`\\sigma^2`
     :type variance: float
-    :param weight_variance: the vector of the variances of the prior over input weights in the neural network :math:`\sigma^2_w`
+    :param weight_variance: the vector of the variances of the prior over input weights in the neural network :math:`\\sigma^2_w`
     :type weight_variance: array or list of the appropriate size (or float if there is only one weight variance parameter)
-    :param bias_variance: the variance of the prior over bias parameters :math:`\sigma^2_b`
-    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension.
+    :param bias_variance: the variance of the prior over bias parameters :math:`\\sigma^2_b`
+    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter \\sigma^2_w), otherwise there is one weight variance parameter per dimension.
     :type ARD: Boolean
     :rtype: Kernpart object
 
diff --git a/GPy/kern/src/rbf.py b/GPy/kern/src/rbf.py
index 9ed81b9c7..28e7a977a 100644
--- a/GPy/kern/src/rbf.py
+++ b/GPy/kern/src/rbf.py
@@ -16,7 +16,7 @@ class RBF(Stationary):
 
     .. math::
 
-       k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg)
+       k(r) = \\sigma^2 \\exp \\bigg(- \\frac{1}{2} r^2 \\bigg)
 
     """
     _support_GPU = True
diff --git a/GPy/kern/src/sde_brownian.py b/GPy/kern/src/sde_brownian.py
index b76761a02..8c0d6efb1 100644
--- a/GPy/kern/src/sde_brownian.py
+++ b/GPy/kern/src/sde_brownian.py
@@ -12,48 +12,48 @@
 
 class sde_Brownian(Brownian):
     """
-    
+
     Class provide extra functionality to transfer this covariance function into
     SDE form.
-    
+
     Linear kernel:
 
     .. math::
 
-       k(x,y) = \sigma^2 min(x,y)
+       k(x,y) = \\sigma^2 min(x,y)
 
     """
-    
+
     def sde_update_gradient_full(self, gradients):
         """
         Update gradient in the order in which parameters are represented in the
         kernel
         """
-    
+
         self.variance.gradient = gradients[0]
-        
-    def sde(self): 
-        """ 
-        Return the state space representation of the covariance. 
-        """ 
-        
+
+    def sde(self):
+        """
+        Return the state space representation of the covariance.
+        """
+
         variance = float(self.variance.values) # this is initial variancve in Bayesian linear regression
-        
+
         F = np.array( ((0,1.0),(0,0) ))
         L = np.array( ((1.0,),(0,)) )
         Qc = np.array( ((variance,),) )
         H = np.array( ((1.0,0),) )
-        
+
         Pinf   = np.array( ( (0, -0.5*variance ), (-0.5*variance, 0) ) )
-        #P0 = Pinf.copy() 
-        P0 = np.zeros((2,2))   
+        #P0 = Pinf.copy()
+        P0 = np.zeros((2,2))
         #Pinf   = np.array( ( (t0, 1.0), (1.0, 1.0/t0) ) ) * variance
         dF = np.zeros((2,2,1))
         dQc    = np.ones( (1,1,1) )
-        
+
         dPinf = np.zeros((2,2,1))
         dPinf[:,:,0] = np.array( ( (0, -0.5), (-0.5, 0) ) )
-        #dP0 = dPinf.copy() 
+        #dP0 = dPinf.copy()
         dP0 = np.zeros((2,2,1))
-  
+
         return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
diff --git a/GPy/kern/src/sde_linear.py b/GPy/kern/src/sde_linear.py
index 943e3bd79..590170683 100644
--- a/GPy/kern/src/sde_linear.py
+++ b/GPy/kern/src/sde_linear.py
@@ -11,15 +11,15 @@
 
 class sde_Linear(Linear):
     """
-    
+
     Class provide extra functionality to transfer this covariance function into
     SDE form.
-    
+
     Linear kernel:
 
     .. math::
 
-       k(x,y) = \sum_{i=1}^{input dim} \sigma^2_i x_iy_i
+       k(x,y) = \\sum_{i=1}^{input dim} \\sigma^2_i x_iy_i
 
     """
     def __init__(self, input_dim, X, variances=None, ARD=False, active_dims=None, name='linear'):
@@ -27,40 +27,40 @@ def __init__(self, input_dim, X, variances=None, ARD=False, active_dims=None, na
         Modify the init method, because one extra parameter is required. X - points
         on the X axis.
         """
-        
+
         super(sde_Linear, self).__init__(input_dim, variances, ARD, active_dims, name)
-        
+
         self.t0 = np.min(X)
-        
-    
+
+
     def sde_update_gradient_full(self, gradients):
         """
         Update gradient in the order in which parameters are represented in the
         kernel
         """
-    
+
         self.variances.gradient = gradients[0]
-        
-    def sde(self): 
-        """ 
-        Return the state space representation of the covariance. 
-        """ 
-        
+
+    def sde(self):
+        """
+        Return the state space representation of the covariance.
+        """
+
         variance = float(self.variances.values) # this is initial variancve in Bayesian linear regression
         t0 = float(self.t0)
-        
+
         F = np.array( ((0,1.0),(0,0) ))
         L = np.array( ((0,),(1.0,)) )
         Qc = np.zeros((1,1))
         H = np.array( ((1.0,0),) )
-        
+
         Pinf   = np.zeros((2,2))
-        P0 = np.array( ( (t0**2, t0), (t0, 1) ) ) * variance        
+        P0 = np.array( ( (t0**2, t0), (t0, 1) ) ) * variance
         dF = np.zeros((2,2,1))
         dQc    = np.zeros( (1,1,1) )
-        
+
         dPinf = np.zeros((2,2,1))
         dP0 = np.zeros((2,2,1))
         dP0[:,:,0]  = P0 / variance
-  
+
         return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
diff --git a/GPy/kern/src/sde_matern.py b/GPy/kern/src/sde_matern.py
index fe3027531..de321f58b 100644
--- a/GPy/kern/src/sde_matern.py
+++ b/GPy/kern/src/sde_matern.py
@@ -11,15 +11,15 @@
 
 class sde_Matern32(Matern32):
     """
-    
+
     Class provide extra functionality to transfer this covariance function into
     SDE forrm.
-    
+
     Matern 3/2 kernel:
 
     .. math::
 
-       k(r) = \sigma^2 (1 + \sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\  \text{ where  } r = \sqrt{\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\ell_i^2} }
+       k(r) = \\sigma^2 (1 + \\sqrt{3} r) \\exp(- \\sqrt{3} r) \\ \\ \\ \\  \\text{ where  } r = \\sqrt{\\sum_{i=1}^{input dim} \\frac{(x_i-y_i)^2}{\\ell_i^2} }
 
     """
     def sde_update_gradient_full(self, gradients):
@@ -27,59 +27,59 @@ def sde_update_gradient_full(self, gradients):
         Update gradient in the order in which parameters are represented in the
         kernel
         """
-    
+
         self.variance.gradient = gradients[0]
         self.lengthscale.gradient = gradients[1]
-        
-    def sde(self): 
-        """ 
-        Return the state space representation of the covariance. 
-        """ 
-        
+
+    def sde(self):
+        """
+        Return the state space representation of the covariance.
+        """
+
         variance = float(self.variance.values)
         lengthscale = float(self.lengthscale.values)
-        
-        foo  = np.sqrt(3.)/lengthscale 
-        F    = np.array(((0, 1.0), (-foo**2, -2*foo))) 
+
+        foo  = np.sqrt(3.)/lengthscale
+        F    = np.array(((0, 1.0), (-foo**2, -2*foo)))
         L    = np.array(( (0,), (1.0,) ))
-        Qc   = np.array(((12.*np.sqrt(3) / lengthscale**3 * variance,),)) 
-        H    = np.array(((1.0, 0),)) 
+        Qc   = np.array(((12.*np.sqrt(3) / lengthscale**3 * variance,),))
+        H    = np.array(((1.0, 0),))
         Pinf = np.array(((variance, 0.0), (0.0, 3.*variance/(lengthscale**2))))
         P0 = Pinf.copy()
-        
-        # Allocate space for the derivatives 
+
+        # Allocate space for the derivatives
         dF    = np.empty([F.shape[0],F.shape[1],2])
-        dQc   = np.empty([Qc.shape[0],Qc.shape[1],2]) 
-        dPinf = np.empty([Pinf.shape[0],Pinf.shape[1],2]) 
-        # The partial derivatives 
-        dFvariance       = np.zeros((2,2)) 
-        dFlengthscale    = np.array(((0,0), (6./lengthscale**3,2*np.sqrt(3)/lengthscale**2))) 
-        dQcvariance      = np.array((12.*np.sqrt(3)/lengthscale**3)) 
-        dQclengthscale   = np.array((-3*12*np.sqrt(3)/lengthscale**4*variance)) 
-        dPinfvariance    = np.array(((1,0),(0,3./lengthscale**2))) 
-        dPinflengthscale = np.array(((0,0), (0,-6*variance/lengthscale**3))) 
-        # Combine the derivatives 
-        dF[:,:,0]    = dFvariance 
-        dF[:,:,1]    = dFlengthscale 
-        dQc[:,:,0]   = dQcvariance 
-        dQc[:,:,1]   = dQclengthscale 
-        dPinf[:,:,0] = dPinfvariance 
-        dPinf[:,:,1] = dPinflengthscale 
+        dQc   = np.empty([Qc.shape[0],Qc.shape[1],2])
+        dPinf = np.empty([Pinf.shape[0],Pinf.shape[1],2])
+        # The partial derivatives
+        dFvariance       = np.zeros((2,2))
+        dFlengthscale    = np.array(((0,0), (6./lengthscale**3,2*np.sqrt(3)/lengthscale**2)))
+        dQcvariance      = np.array((12.*np.sqrt(3)/lengthscale**3))
+        dQclengthscale   = np.array((-3*12*np.sqrt(3)/lengthscale**4*variance))
+        dPinfvariance    = np.array(((1,0),(0,3./lengthscale**2)))
+        dPinflengthscale = np.array(((0,0), (0,-6*variance/lengthscale**3)))
+        # Combine the derivatives
+        dF[:,:,0]    = dFvariance
+        dF[:,:,1]    = dFlengthscale
+        dQc[:,:,0]   = dQcvariance
+        dQc[:,:,1]   = dQclengthscale
+        dPinf[:,:,0] = dPinfvariance
+        dPinf[:,:,1] = dPinflengthscale
         dP0 = dPinf.copy()
-        
+
         return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
 
 class sde_Matern52(Matern52):
     """
-    
+
     Class provide extra functionality to transfer this covariance function into
     SDE forrm.
-    
+
     Matern 5/2 kernel:
 
     .. math::
 
-       k(r) = \sigma^2 (1 + \sqrt{5} r + \frac{5}{3}r^2) \exp(- \sqrt{5} r) \\ \\ \\ \\  \text{ where  } r = \sqrt{\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\ell_i^2} }
+       k(r) = \\sigma^2 (1 + \\sqrt{5} r + \\frac{5}{3}r^2) \\exp(- \\sqrt{5} r) \\ \\ \\ \\  \\text{ where  } r = \\sqrt{\\sum_{i=1}^{input dim} \\frac{(x_i-y_i)^2}{\\ell_i^2} }
 
     """
     def sde_update_gradient_full(self, gradients):
@@ -87,51 +87,51 @@ def sde_update_gradient_full(self, gradients):
         Update gradient in the order in which parameters are represented in the
         kernel
         """
-    
+
         self.variance.gradient = gradients[0]
         self.lengthscale.gradient = gradients[1]
-        
-    def sde(self): 
-        """ 
-        Return the state space representation of the covariance. 
-        """ 
-        
+
+    def sde(self):
+        """
+        Return the state space representation of the covariance.
+        """
+
         variance = float(self.variance.values)
         lengthscale = float(self.lengthscale.values)
 
         lamda = np.sqrt(5.0)/lengthscale
-        kappa = 5.0/3.0*variance/lengthscale**2        
-        
+        kappa = 5.0/3.0*variance/lengthscale**2
+
         F = np.array(((0, 1,0), (0, 0, 1), (-lamda**3, -3.0*lamda**2, -3*lamda)))
         L = np.array(((0,),(0,),(1,)))
         Qc = np.array((((variance*400.0*np.sqrt(5.0)/3.0/lengthscale**5),),))
-        H = np.array(((1,0,0),))        
-        
+        H = np.array(((1,0,0),))
+
         Pinf = np.array(((variance,0,-kappa), (0, kappa, 0), (-kappa, 0, 25.0*variance/lengthscale**4)))
         P0 = Pinf.copy()
-        # Allocate space for the derivatives         
-        dF = np.empty((3,3,2))        
-        dQc = np.empty((1,1,2))        
+        # Allocate space for the derivatives
+        dF = np.empty((3,3,2))
+        dQc = np.empty((1,1,2))
         dPinf = np.empty((3,3,2))
-        
-         # The partial derivatives 
+
+         # The partial derivatives
         dFvariance = np.zeros((3,3))
-        dFlengthscale = np.array(((0,0,0),(0,0,0),(15.0*np.sqrt(5.0)/lengthscale**4, 
+        dFlengthscale = np.array(((0,0,0),(0,0,0),(15.0*np.sqrt(5.0)/lengthscale**4,
                                    30.0/lengthscale**3, 3*np.sqrt(5.0)/lengthscale**2)))
         dQcvariance = np.array((((400*np.sqrt(5)/3/lengthscale**5,),)))
-        dQclengthscale = np.array((((-variance*2000*np.sqrt(5)/3/lengthscale**6,),)))        
-        
+        dQclengthscale = np.array((((-variance*2000*np.sqrt(5)/3/lengthscale**6,),)))
+
         dPinf_variance = Pinf/variance
         kappa2 = -2.0*kappa/lengthscale
-        dPinf_lengthscale = np.array(((0,0,-kappa2),(0,kappa2,0),(-kappa2, 
-                                    0,-100*variance/lengthscale**5)))        
-        # Combine the derivatives 
+        dPinf_lengthscale = np.array(((0,0,-kappa2),(0,kappa2,0),(-kappa2,
+                                    0,-100*variance/lengthscale**5)))
+        # Combine the derivatives
         dF[:,:,0] = dFvariance
-        dF[:,:,1] = dFlengthscale        
-        dQc[:,:,0] = dQcvariance         
-        dQc[:,:,1] = dQclengthscale        
+        dF[:,:,1] = dFlengthscale
+        dQc[:,:,0] = dQcvariance
+        dQc[:,:,1] = dQclengthscale
         dPinf[:,:,0] = dPinf_variance
         dPinf[:,:,1] = dPinf_lengthscale
         dP0 = dPinf.copy()
-        
-        return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)  
\ No newline at end of file
+
+        return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
\ No newline at end of file
diff --git a/GPy/kern/src/sde_standard_periodic.py b/GPy/kern/src/sde_standard_periodic.py
index 2963eb667..add459363 100644
--- a/GPy/kern/src/sde_standard_periodic.py
+++ b/GPy/kern/src/sde_standard_periodic.py
@@ -24,8 +24,8 @@ class sde_StdPeriodic(StdPeriodic):
 
     .. math::
 
-       k(x,y) = \theta_1 \exp \left[  - \frac{1}{2} {}\sum_{i=1}^{input\_dim}
-       \left( \frac{\sin(\frac{\pi}{\lambda_i} (x_i - y_i) )}{l_i} \right)^2 \right] }
+       k(x,y) = \\theta_1 \\exp \\left[  - \\frac{1}{2} {}\\sum_{i=1}^{input\\_dim}
+       \\left( \\frac{\\sin(\\frac{\\pi}{\\lambda_i} (x_i - y_i) )}{l_i} \\right)^2 \\right]
 
     """
 
@@ -177,7 +177,7 @@ def seriescoeff(m=6, lengthScale=1.0, magnSigma2=1.0, true_covariance=False):
     Calculate the coefficients q_j^2 for the covariance function
     approximation:
 
-        k(\tau) =  \sum_{j=0}^{+\infty} q_j^2 \cos(j\omega_0 \tau)
+        k(\\tau) =  \\sum_{j=0}^{+\\infty} q_j^2 \\cos(j\\omega_0 \\tau)
 
     Reference is:
 
diff --git a/GPy/kern/src/sde_static.py b/GPy/kern/src/sde_static.py
index 6a30c693c..c61660a9a 100644
--- a/GPy/kern/src/sde_static.py
+++ b/GPy/kern/src/sde_static.py
@@ -12,63 +12,63 @@
 
 class sde_White(White):
     """
-    
+
     Class provide extra functionality to transfer this covariance function into
     SDE forrm.
-    
+
     White kernel:
 
     .. math::
 
-       k(x,y) = \alpha*\delta(x-y)
+       k(x,y) = \\alpha*\\delta(x-y)
 
     """
-    
+
     def sde_update_gradient_full(self, gradients):
         """
         Update gradient in the order in which parameters are represented in the
         kernel
         """
-    
+
         self.variance.gradient = gradients[0]
-        
-    def sde(self): 
-        """ 
-        Return the state space representation of the covariance. 
-        """ 
-        
-        variance = float(self.variance.values) 
-        
+
+    def sde(self):
+        """
+        Return the state space representation of the covariance.
+        """
+
+        variance = float(self.variance.values)
+
         F = np.array( ((-np.inf,),) )
         L = np.array( ((1.0,),)  )
         Qc = np.array( ((variance,),)  )
         H = np.array( ((1.0,),) )
-        
+
         Pinf   = np.array( ((variance,),)  )
-        P0 = Pinf.copy()     
-        
+        P0 = Pinf.copy()
+
         dF = np.zeros((1,1,1))
         dQc = np.zeros((1,1,1))
         dQc[:,:,0]    = np.array( ((1.0,),) )
-        
+
         dPinf = np.zeros((1,1,1))
         dPinf[:,:,0] = np.array( ((1.0,),) )
         dP0 = dPinf.copy()
-        
+
         return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
 
 
 class sde_Bias(Bias):
     """
-    
+
     Class provide extra functionality to transfer this covariance function into
     SDE forrm.
-    
+
     Bias kernel:
 
     .. math::
 
-       k(x,y) = \alpha
+       k(x,y) = \\alpha
 
     """
     def sde_update_gradient_full(self, gradients):
@@ -76,28 +76,28 @@ def sde_update_gradient_full(self, gradients):
         Update gradient in the order in which parameters are represented in the
         kernel
         """
-    
+
         self.variance.gradient = gradients[0]
-        
-    def sde(self): 
-        """ 
-        Return the state space representation of the covariance. 
-        """ 
-        variance = float(self.variance.values) 
-        
+
+    def sde(self):
+        """
+        Return the state space representation of the covariance.
+        """
+        variance = float(self.variance.values)
+
         F = np.array( ((0.0,),))
         L = np.array( ((1.0,),))
         Qc = np.zeros((1,1))
         H = np.array( ((1.0,),))
-        
+
         Pinf   = np.zeros((1,1))
-        P0 = np.array( ((variance,),) )      
-        
+        P0 = np.array( ((variance,),) )
+
         dF = np.zeros((1,1,1))
         dQc    = np.zeros((1,1,1))
-        
+
         dPinf = np.zeros((1,1,1))
         dP0 = np.zeros((1,1,1))
         dP0[:,:,0] = np.array( ((1.0,),) )
-        
+
         return (F, L, Qc, H, Pinf, P0, dF, dQc, dPinf, dP0)
\ No newline at end of file
diff --git a/GPy/kern/src/sde_stationary.py b/GPy/kern/src/sde_stationary.py
index 615167999..491135c11 100644
--- a/GPy/kern/src/sde_stationary.py
+++ b/GPy/kern/src/sde_stationary.py
@@ -29,7 +29,7 @@ class sde_RBF(RBF):
 
     .. math::
 
-        k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \\ \\ \\ \\  \text{ where  } r = \sqrt{\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\ell_i^2} }
+        k(r) = \\sigma^2 \\exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \\ \\ \\ \\  \\text{ where  } r = \\sqrt{\\sum_{i=1}^{input dim} \\frac{(x_i-y_i)^2}{\\ell_i^2} }
 
     """
 
@@ -102,7 +102,7 @@ def sde(self):
         eps = 1e-12
         if (float(Qc) > 1.0 / eps) or (float(Qc) < eps):
             warnings.warn(
-                """sde_RBF kernel: the noise variance Qc is either very large or very small. 
+                """sde_RBF kernel: the noise variance Qc is either very large or very small.
                                 It influece conditioning of P_inf: {0:e}""".format(
                     float(Qc)
                 )
@@ -204,7 +204,7 @@ class sde_Exponential(Exponential):
 
     .. math::
 
-       k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r \\bigg) \\ \\ \\ \\  \text{ where  } r = \sqrt{\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\ell_i^2} }
+       k(r) = \\sigma^2 \\exp \\bigg(- \\frac{1}{2} r \\bigg) \\ \\ \\ \\  \\text{ where  } r = \\sqrt{\\sum_{i=1}^{input dim} \\frac{(x_i-y_i)^2}{\\ell_i^2} }
 
     """
 
@@ -259,7 +259,7 @@ class sde_RatQuad(RatQuad):
 
     .. math::
 
-       k(r) = \sigma^2 \\bigg( 1 + \\frac{r^2}{2} \\bigg)^{- \alpha} \\ \\ \\ \\  \text{ where  } r = \sqrt{\sum_{i=1}^{input dim} \frac{(x_i-y_i)^2}{\ell_i^2} }
+       k(r) = \\sigma^2 \\bigg( 1 + \\frac{r^2}{2} \\bigg)^{- \\alpha} \\ \\ \\ \\  \\text{ where  } r = \\sqrt{\\sum_{i=1}^{input dim} \\frac{(x_i-y_i)^2}{\\ell_i^2} }
 
     """
 
diff --git a/GPy/kern/src/standard_periodic.py b/GPy/kern/src/standard_periodic.py
index c8b91563d..6168956a1 100644
--- a/GPy/kern/src/standard_periodic.py
+++ b/GPy/kern/src/standard_periodic.py
@@ -24,19 +24,19 @@ class StdPeriodic(Kern):
 
     .. math::
 
-       k(x,y) = \theta_1 \exp \left[  - \frac{1}{2} \sum_{i=1}^{input\_dim}
-       \left( \frac{\sin(\frac{\pi}{T_i} (x_i - y_i) )}{l_i} \right)^2 \right] }
+       k(x,y) = \\theta_1 \\exp \\left[  - \\frac{1}{2} \\sum_{i=1}^{input\\_dim}
+       \\left( \\frac{\\sin(\\frac{\\pi}{T_i} (x_i - y_i) )}{l_i} \\right)^2 \\right]
 
     :param input_dim: the number of input dimensions
     :type input_dim: int
-    :param variance: the variance :math:`\theta_1` in the formula above
+    :param variance: the variance :math:`\\theta_1` in the formula above
     :type variance: float
-    :param period: the vector of periods :math:`\T_i`. If None then 1.0 is assumed.
+    :param period: the vector of periods :math:`T_i`. If None then 1.0 is assumed.
     :type period: array or list of the appropriate size (or float if there is only one period parameter)
-    :param lengthscale: the vector of lengthscale :math:`\l_i`. If None then 1.0 is assumed.
+    :param lengthscale: the vector of lengthscales :math:`l_i`. If None then 1.0 is assumed.
     :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
     :param ARD1: Auto Relevance Determination with respect to period.
-        If equal to "False" one single period parameter :math:`\T_i` for
+        If equal to "False" one single period parameter :math:`T_i` for
         each dimension is assumed, otherwise there is one lengthscale
         parameter per dimension.
     :type ARD1: Boolean
@@ -177,7 +177,7 @@ def dK_dX2diag(self, X, dimX2):
         Returns only diagonal elements.
         """
         return np.zeros(X.shape[0])
-    
+
     def dK2_dXdX2(self, X, X2, dimX, dimX2):
         """
         Compute the second derivative of K with respect to:
@@ -578,9 +578,9 @@ def gradients_X(self, dL_dK, X, X2=None):
             X2 = X
         dX = -np.pi*((dL_dK*K)[:,:,None]*np.sin(2*np.pi/self.period*(X[:,None,:] - X2[None,:,:]))/(2.*np.square(self.lengthscale)*self.period)).sum(1)
         return dX
-    
+
     def gradients_X_diag(self, dL_dKdiag, X):
         return np.zeros(X.shape)
-    
+
     def input_sensitivity(self, summarize=True):
         return self.variance*np.ones(self.input_dim)/self.lengthscale**2
diff --git a/GPy/kern/src/stationary.py b/GPy/kern/src/stationary.py
index e39310083..507684b8d 100644
--- a/GPy/kern/src/stationary.py
+++ b/GPy/kern/src/stationary.py
@@ -35,7 +35,7 @@ class Stationary(Kern):
 
     .. math::
 
-        r(x, x') = \\sqrt{ \\sum_{q=1}^Q \\frac{(x_q - x'_q)^2}{\ell_q^2} }.
+        r(x, x') = \\sqrt{ \\sum_{q=1}^Q \\frac{(x_q - x'_q)^2}{\\ell_q^2} }.
 
     By default, there's only one lengthscale: seaprate lengthscales for each
     dimension can be enables by setting ARD=True.
@@ -153,7 +153,7 @@ def _scaled_dist(self, X, X2=None):
         Efficiently compute the scaled distance, r.
 
         ..math::
-            r = \sqrt( \sum_{q=1}^Q (x_q - x'q)^2/l_q^2 )
+            r = \\sqrt( \\sum_{q=1}^Q (x_q - x'q)^2/l_q^2 )
 
         Note that if thre is only one lengthscale, l comes outside the sum. In
         this case we compute the unscaled distance first (in a separate
@@ -259,7 +259,7 @@ def gradients_XX(self, dL_dK, X, X2=None):
         the returned array is of shape [NxNxQxQ].
 
         ..math:
-            \frac{\partial^2 K}{\partial X2 ^2} = - \frac{\partial^2 K}{\partial X\partial X2}
+            \\frac{\\partial^2 K}{\\partial X2 ^2} = - \\frac{\\partial^2 K}{\\partial X\\partial X2}
 
         ..returns:
             dL2_dXdX2:  [NxMxQxQ] in the cov=True case, or [NxMxQ] in the cov=False case,
@@ -295,7 +295,7 @@ def gradients_XX_diag(self, dL_dK_diag, X):
         Given the derivative of the objective dL_dK, compute the second derivative of K wrt X:
 
         ..math:
-          \frac{\partial^2 K}{\partial X\partial X}
+          \\frac{\\partial^2 K}{\\partial X\\partial X}
 
         ..returns:
             dL2_dXdX: [NxQxQ]
@@ -423,7 +423,7 @@ class OU(Stationary):
 
     .. math::
 
-       k(r) = \\sigma^2 \exp(- r) \\ \\ \\ \\  \\text{ where  } r = \sqrt{\sum_{i=1}^{\text{input_dim}} \\frac{(x_i-y_i)^2}{\ell_i^2} }
+       k(r) = \\sigma^2 \\exp(- r) \\ \\ \\ \\  \\text{ where  } r = \\sqrt{\\sum_{i=1}^{\\text{input_dim}} \\frac{(x_i-y_i)^2}{\\ell_i^2} }
 
     """
 
@@ -460,7 +460,7 @@ class Matern32(Stationary):
 
     .. math::
 
-       k(r) = \\sigma^2 (1 + \\sqrt{3} r) \exp(- \sqrt{3} r) \\ \\ \\ \\  \\text{ where  } r = \sqrt{\sum_{i=1}^{\\text{input_dim}} \\frac{(x_i-y_i)^2}{\ell_i^2} }
+       k(r) = \\sigma^2 (1 + \\sqrt{3} r) \\exp(- \\sqrt{3} r) \\ \\ \\ \\  \\text{ where  } r = \\sqrt{\\sum_{i=1}^{\\text{input_dim}} \\frac{(x_i-y_i)^2}{\\ell_i^2} }
 
     """
 
@@ -559,7 +559,7 @@ class Matern52(Stationary):
 
     .. math::
 
-       k(r) = \sigma^2 (1 + \sqrt{5} r + \\frac53 r^2) \exp(- \sqrt{5} r)
+       k(r) = \\sigma^2 (1 + \\sqrt{5} r + \\frac53 r^2) \\exp(- \\sqrt{5} r)
     """
     def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Mat52'):
         super(Matern52, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
@@ -626,7 +626,7 @@ class ExpQuad(Stationary):
 
     .. math::
 
-       k(r) = \sigma^2 \exp(- 0.5 r^2)
+       k(r) = \\sigma^2 \\exp(- 0.5 r^2)
 
     notes::
      - This is exactly the same as the RBF covariance function, but the
@@ -664,10 +664,10 @@ def dK_dr(self, r):
 class Cosine(Stationary):
     """
     Cosine Covariance function
-    
+
     .. math::
 
-        k(r) = \sigma^2 \cos(r)
+        k(r) = \\sigma^2 \\cos(r)
 
     """
     def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Cosine'):
@@ -682,18 +682,18 @@ def dK_dr(self, r):
 class ExpQuadCosine(Stationary):
     """
     Exponentiated quadratic multiplied by cosine covariance function (spectral mixture kernel).
-    
+
     .. math::
 
-        k(r) = \sigma^2 \exp(-2\pi^2r^2)\cos(2\pi r/T)
+        k(r) = \\sigma^2 \\exp(-2\\pi^2r^2)\\cos(2\\pi r/T)
 
     """
-    
+
     def __init__(self, input_dim, variance=1., lengthscale=None, period=1., ARD=False, active_dims=None, name='ExpQuadCosine'):
         super(ExpQuadCosine, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
         self.period = Param('period', period, Logexp())
         self.link_parameters(self.period)
-        
+
     def K_of_r(self, r):
         return self.variance * np.exp(-2*np.pi**2*r**2)*np.cos(2*np.pi*r/self.period)
 
@@ -712,18 +712,18 @@ def update_gradients_diag(self, dL_dKdiag, X):
         super(ExpQuadCosine, self).update_gradients_diag(dL_dKdiag, X)
         self.period.gradient = 0.
 
-    
-    
+
+
 class Sinc(Stationary):
     """
     Sinc Covariance function
-    
+
     .. math::
 
-        k(r) = \sigma^2 \sinc(\pi r)
+        k(r) = \\sigma^2 \\operatorname{sinc}(\\pi r)
 
     """
-    
+
     def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='Sinc'):
         super(Sinc, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
 
@@ -734,7 +734,7 @@ def dK_dr(self, r):
         # small angle approximation to avoid divide by zero errors.
         return np.where(r<1e-5, -self.variance*4/3*np.pi*np.pi*r, self.variance/r * (np.cos(2*np.pi*r)-np.sinc(2*r)))
 
-    
+
 
 class RatQuad(Stationary):
     """
@@ -742,7 +742,7 @@ class RatQuad(Stationary):
 
     .. math::
 
-       k(r) = \sigma^2 \\bigg( 1 + \\frac{r^2}{2} \\bigg)^{- \\alpha}
+       k(r) = \\sigma^2 \\bigg( 1 + \\frac{r^2}{2} \\bigg)^{- \\alpha}
 
     """
 
diff --git a/GPy/kern/src/todo/eq_ode1.py b/GPy/kern/src/todo/eq_ode1.py
index 7104a8e9c..71604a205 100644
--- a/GPy/kern/src/todo/eq_ode1.py
+++ b/GPy/kern/src/todo/eq_ode1.py
@@ -14,23 +14,23 @@ class Eq_ode1(Kernpart):
 
     This outputs of this kernel have the form
     .. math::
-       \frac{\text{d}y_j}{\text{d}t} = \sum_{i=1}^R w_{j,i} f_i(t-\delta_j) +\sqrt{\kappa_j}g_j(t) - d_jy_j(t)
+       \\frac{\\text{d}y_j}{\\text{d}t} = \\sum_{i=1}^R w_{j,i} f_i(t-\\delta_j) +\\sqrt{\\kappa_j}g_j(t) - d_jy_j(t)
 
     where :math:`R` is the rank of the system, :math:`w_{j,i}` is the sensitivity of the :math:`j`th output to the :math:`i`th latent function, :math:`d_j` is the decay rate of the :math:`j`th output and :math:`f_i(t)` and :math:`g_i(t)` are independent latent Gaussian processes goverened by an exponentiated quadratic covariance.
-    
+
     :param output_dim: number of outputs driven by latent function.
     :type output_dim: int
-    :param W: sensitivities of each output to the latent driving function. 
+    :param W: sensitivities of each output to the latent driving function.
     :type W: ndarray (output_dim x rank).
     :param rank: If rank is greater than 1 then there are assumed to be a total of rank latent forces independently driving the system, each with identical covariance.
     :type rank: int
-    :param decay: decay rates for the first order system. 
+    :param decay: decay rates for the first order system.
     :type decay: array of length output_dim.
     :param delay: delay between latent force and output response.
     :type delay: array of length output_dim.
     :param kappa: diagonal term that allows each latent output to have an independent component to the response.
     :type kappa: array of length output_dim.
-    
+
     .. Note: see first order differential equation examples in GPy.examples.regression for some usage.
     """
     def __init__(self,output_dim, W=None, rank=1, kappa=None, lengthscale=1.0,  decay=None, delay=None):
@@ -62,7 +62,7 @@ def __init__(self,output_dim, W=None, rank=1, kappa=None, lengthscale=1.0,  deca
         self.is_stationary = False
         self.gaussian_initial = False
         self._set_params(self._get_params())
-        
+
     def _get_params(self):
         param_list = [self.W.flatten()]
         if self.kappa is not None:
@@ -103,11 +103,11 @@ def _get_param_names(self):
         param_names += ['decay_%i'%i for i in range(1,self.output_dim)]
         if self.delay is not None:
             param_names += ['delay_%i'%i for i in 1+range(1,self.output_dim)]
-        param_names+= ['lengthscale'] 
+        param_names+= ['lengthscale']
         return param_names
 
     def K(self,X,X2,target):
-        
+
         if X.shape[1] > 2:
             raise ValueError('Input matrix for ode1 covariance should have at most two columns, one containing times, the other output indices')
 
@@ -123,13 +123,13 @@ def K(self,X,X2,target):
     def Kdiag(self,index,target):
         #target += np.diag(self.B)[np.asarray(index,dtype=int).flatten()]
         pass
-    
+
     def _param_grad_helper(self,dL_dK,X,X2,target):
-        
+
         # First extract times and indices.
         self._extract_t_indices(X, X2, dL_dK=dL_dK)
         self._dK_ode_dtheta(target)
-        
+
 
     def _dK_ode_dtheta(self, target):
         """Do all the computations for the ode parts of the covariance function."""
@@ -138,7 +138,7 @@ def _dK_ode_dtheta(self, target):
         index_ode = self._index[self._index>0]-1
         if self._t2 is None:
             if t_ode.size==0:
-                return        
+                return
             t2_ode = t_ode
             dL_dK_ode = dL_dK_ode[:, self._index>0]
             index2_ode = index_ode
@@ -210,7 +210,7 @@ def _extract_t_indices(self, X, X2=None, dL_dK=None):
         self._index = self._index[self._order]
         self._t = self._t[self._order]
         self._rorder = self._order.argsort() # rorder is for reversing the order
-        
+
         if X2 is None:
             self._t2 = None
             self._index2 = None
@@ -229,7 +229,7 @@ def _extract_t_indices(self, X, X2=None, dL_dK=None):
         if dL_dK is not None:
             self._dL_dK = dL_dK[self._order, :]
             self._dL_dK = self._dL_dK[:, self._order2]
-            
+
     def _K_computations(self, X, X2):
         """Perform main body of computations for the ode1 covariance function."""
         # First extract times and indices.
@@ -253,8 +253,8 @@ def _K_computations(self, X, X2):
                                                    np.hstack((self._K_ode_eq, self._K_ode))))
         self._K_dvar = self._K_dvar[self._rorder, :]
         self._K_dvar = self._K_dvar[:, self._rorder2]
-        
-        
+
+
         if X2 is None:
             # Matrix giving scales of each output
             self._scale = np.zeros((self._t.size, self._t.size))
@@ -303,7 +303,7 @@ def _K_compute_eq(self):
                 self._K_eq = np.zeros((t_eq.size, t2_eq.size))
                 return
             self._dist2 = np.square(t_eq[:, None] - t2_eq[None, :])
-        
+
         self._K_eq = np.exp(-self._dist2/(2*self.lengthscale*self.lengthscale))
         if self.is_normalized:
             self._K_eq/=(np.sqrt(2*np.pi)*self.lengthscale)
@@ -361,7 +361,7 @@ def _K_compute_ode_eq(self, transpose=False):
             self._K_eq_ode = sK.T
         else:
             self._K_ode_eq = sK
-        
+
     def _K_compute_ode(self):
         # Compute covariances between outputs of the ODE models.
 
@@ -370,7 +370,7 @@ def _K_compute_ode(self):
         if self._t2 is None:
             if t_ode.size==0:
                 self._K_ode = np.zeros((0, 0))
-                return        
+                return
             t2_ode = t_ode
             index2_ode = index_ode
         else:
@@ -379,14 +379,14 @@ def _K_compute_ode(self):
                 self._K_ode = np.zeros((t_ode.size, t2_ode.size))
                 return
             index2_ode = self._index2[self._index2>0]-1
-        
+
         # When index is identical
         h = self._compute_H(t_ode, index_ode, t2_ode, index2_ode, stationary=self.is_stationary)
 
         if self._t2 is None:
             self._K_ode = 0.5 * (h + h.T)
         else:
-            h2 = self._compute_H(t2_ode, index2_ode, t_ode, index_ode, stationary=self.is_stationary)                
+            h2 = self._compute_H(t2_ode, index2_ode, t_ode, index_ode, stationary=self.is_stationary)
             self._K_ode = 0.5 * (h + h2.T)
 
         if not self.is_normalized:
@@ -410,28 +410,28 @@ def _compute_diag_H(self, t, index, update_derivatives=False, stationary=False):
             Decay = self.decay[index]
             if self.delay is not None:
                 t = t - self.delay[index]
-            
+
             t_squared = t*t
             half_sigma_decay = 0.5*self.sigma*Decay
             [ln_part_1, sign1] = ln_diff_erfs(half_sigma_decay + t/self.sigma,
                                               half_sigma_decay)
-    
+
             [ln_part_2, sign2] = ln_diff_erfs(half_sigma_decay,
                                               half_sigma_decay - t/self.sigma)
-            
+
             h = (sign1*np.exp(half_sigma_decay*half_sigma_decay
                              + ln_part_1
-                             - log(Decay + D_j)) 
+                             - log(Decay + D_j))
                  - sign2*np.exp(half_sigma_decay*half_sigma_decay
                                 - (Decay + D_j)*t
-                                + ln_part_2 
+                                + ln_part_2
                                 - log(Decay + D_j)))
-    
+
             sigma2 = self.sigma*self.sigma
 
         if update_derivatives:
-        
-            dh_dD_i = ((0.5*Decay*sigma2*(Decay + D_j)-1)*h 
+
+            dh_dD_i = ((0.5*Decay*sigma2*(Decay + D_j)-1)*h
                        + t*sign2*np.exp(
                 half_sigma_decay*half_sigma_decay-(Decay+D_j)*t + ln_part_2
                 )
@@ -439,11 +439,11 @@ def _compute_diag_H(self, t, index, update_derivatives=False, stationary=False):
                        (-1 + np.exp(-t_squared/sigma2-Decay*t)
                         + np.exp(-t_squared/sigma2-D_j*t)
                         - np.exp(-(Decay + D_j)*t)))
-        
+
             dh_dD_i = (dh_dD_i/(Decay+D_j)).real
-        
-        
-        
+
+
+
             dh_dD_j = (t*sign2*np.exp(
                 half_sigma_decay*half_sigma_decay-(Decay + D_j)*t+ln_part_2
                 )
@@ -457,7 +457,7 @@ def _compute_diag_H(self, t, index, update_derivatives=False, stationary=False):
                           - (-t/sigma2-Decay/2)*np.exp(-t_squared/sigma2 - D_j*t) \
                           - Decay/2*np.exp(-(Decay+D_j)*t))"""
         pass
-    
+
     def _compute_H(self, t, index, t2, index2, update_derivatives=False, stationary=False):
         """Helper function for computing part of the ode1 covariance function.
 
@@ -491,7 +491,7 @@ def _compute_H(self, t, index, t2, index2, update_derivatives=False, stationary=
         inv_sigma_diff_t = 1./self.sigma*diff_t
         half_sigma_decay_i = 0.5*self.sigma*Decay[:, None]
 
-        ln_part_1, sign1 = ln_diff_erfs(half_sigma_decay_i + t2_mat/self.sigma, 
+        ln_part_1, sign1 = ln_diff_erfs(half_sigma_decay_i + t2_mat/self.sigma,
                                         half_sigma_decay_i - inv_sigma_diff_t,
                                         return_sign=True)
         ln_part_2, sign2 = ln_diff_erfs(half_sigma_decay_i,
@@ -529,7 +529,7 @@ def _compute_H(self, t, index, t2, index2, update_derivatives=False, stationary=
                 )
                 ))
             self._dh_ddecay = (dh_ddecay/(Decay[:, None]+Decay2[None, :])).real
-            
+
             # Update jth decay gradient
             dh_ddecay2 = (t2_mat*sign2
                          *np.exp(
@@ -539,7 +539,7 @@ def _compute_H(self, t, index, t2, index2, update_derivatives=False, stationary=
                 )
                          -h)
             self._dh_ddecay2 = (dh_ddecay/(Decay[:, None] + Decay2[None, :])).real
-            
+
             # Update sigma gradient
             self._dh_dsigma = (half_sigma_decay_i*Decay[:, None]*h
                                + 2/(np.sqrt(np.pi)
@@ -547,10 +547,10 @@ def _compute_H(self, t, index, t2, index2, update_derivatives=False, stationary=
                                *((-diff_t/sigma2-Decay[:, None]/2)
                                  *np.exp(-diff_t*diff_t/sigma2)
                                  + (-t2_mat/sigma2+Decay[:, None]/2)
-                                 *np.exp(-t2_mat*t2_mat/sigma2-Decay[:, None]*t_mat) 
-                                 - (-t_mat/sigma2-Decay[:, None]/2) 
-                                 *np.exp(-t_mat*t_mat/sigma2-Decay2[None, :]*t2_mat) 
+                                 *np.exp(-t2_mat*t2_mat/sigma2-Decay[:, None]*t_mat)
+                                 - (-t_mat/sigma2-Decay[:, None]/2)
+                                 *np.exp(-t_mat*t_mat/sigma2-Decay2[None, :]*t2_mat)
                                  - Decay[:, None]/2
                                  *np.exp(-(Decay[:, None]*t_mat+Decay2[None, :]*t2_mat))))
-                
+
         return h
diff --git a/GPy/kern/src/todo/gibbs.py b/GPy/kern/src/todo/gibbs.py
index 682412454..1626af1b4 100644
--- a/GPy/kern/src/todo/gibbs.py
+++ b/GPy/kern/src/todo/gibbs.py
@@ -9,13 +9,13 @@
 
 class Gibbs(Kernpart):
     """
-    Gibbs non-stationary covariance function. 
+    Gibbs non-stationary covariance function.
 
     .. math::
-       
+
        r = sqrt((x_i - x_j)'*(x_i - x_j))
-       
-       k(x_i, x_j) = \sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x')))
+
+       k(x_i, x_j) = \\sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x')))
 
        Z = (2*l(x)*l(x')/(l(x)*l(x) + l(x')*l(x')^{q/2}
 
@@ -25,18 +25,18 @@ class Gibbs(Kernpart):
         with input location. This leads to an additional term in front of
         the kernel.
 
-        The parameters are :math:`\sigma^2`, the process variance, and
+        The parameters are :math:`\\sigma^2`, the process variance, and
         the parameters of l(x) which is a function that can be
         specified by the user, by default an multi-layer peceptron is
         used.
 
         :param input_dim: the number of input dimensions
-        :type input_dim: int 
-        :param variance: the variance :math:`\sigma^2`
+        :type input_dim: int
+        :param variance: the variance :math:`\\sigma^2`
         :type variance: float
         :param mapping: the mapping that gives the lengthscale across the input space (by default GPy.mappings.MLP is used with 20 hidden nodes).
         :type mapping: GPy.core.Mapping
-        :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension.
+        :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter \\sigma^2_w), otherwise there is one weight variance parameter per dimension.
         :type ARD: Boolean
         :rtype: Kernpart object
 
@@ -113,7 +113,7 @@ def gradients_X(self, dL_dK, X, X2, target):
             target += 2.*self.mapping.df_dX(self._dL_dl[:, None], X)
         else:
             target += self.mapping.df_dX(self._dL_dl[:, None], X)
-    
+
     def dKdiag_dX(self, dL_dKdiag, X, target):
         """Gradient of diagonal of covariance with respect to X."""
         pass
@@ -123,7 +123,7 @@ def dKdiag_dtheta(self, dL_dKdiag, X, target):
         target[0] += np.sum(dL_dKdiag)
 
 
-    
+
     def _K_computations(self, X, X2=None):
         """Pre-computations for the covariance function (used both when computing the covariance and its gradients). Here self._dK_dvar and self._K_dist2 are updated."""
         self._lengthscales=self.mapping.f(X)
@@ -146,7 +146,7 @@ def _dK_computations(self, dL_dK):
         """Pre-computations for the gradients of the covaraince function. Here the gradient of the covariance with respect to all the individual lengthscales is computed.
         :param dL_dK: the gradient of the objective with respect to the covariance function.
         :type dL_dK: ndarray"""
-        
+
         self._dL_dl = (dL_dK*self.variance*self._K_dvar*(self.input_dim/2.*(self._lengthscales_two.T**4 - self._lengthscales**4) + 2*self._lengthscales2*self._K_dist2)/(self._w2*self._w2*self._lengthscales)).sum(1)
         if self._lengthscales_two is self._lengthscales:
             self._dL_dl_two = None
diff --git a/GPy/kern/src/todo/hetero.py b/GPy/kern/src/todo/hetero.py
index 507f6251e..63a1600dc 100644
--- a/GPy/kern/src/todo/hetero.py
+++ b/GPy/kern/src/todo/hetero.py
@@ -19,11 +19,11 @@ class Hetero(Kernpart):
 
     .. math::
 
-       k(x_i, x_j) = \delta_{i,j} \sigma^2(x_i)
+       k(x_i, x_j) = \\delta_{i,j} \\sigma^2(x_i)
 
-       where :math:`\sigma^2(x)` is a function giving the variance  as a function of input space and :math:`\delta_{i,j}` is the Kronecker delta function.
+       where :math:`\\sigma^2(x)` is a function giving the variance  as a function of input space and :math:`\\delta_{i,j}` is the Kronecker delta function.
 
-    The parameters are the parameters of \sigma^2(x) which is a
+    The parameters are the parameters of \\sigma^2(x) which is a
     function that can be specified by the user, by default an
     multi-layer peceptron is used.
 
diff --git a/GPy/kern/src/todo/poly.py b/GPy/kern/src/todo/poly.py
index 0deb11f47..439450329 100644
--- a/GPy/kern/src/todo/poly.py
+++ b/GPy/kern/src/todo/poly.py
@@ -11,28 +11,28 @@ class POLY(Kernpart):
     Polynomial kernel parameter initialisation.  Included for completeness, but generally not recommended, is the polynomial kernel:
 
     .. math::
-        k(x, y) = \sigma^2\*(\sigma_w^2 x'y+\sigma_b^b)^d
+        k(x, y) = \\sigma^2\\*(\\sigma_w^2 x'y+\\sigma_b^2)^d
 
-    The kernel parameters are :math:`\sigma^2` (variance), :math:`\sigma^2_w`
-    (weight_variance), :math:`\sigma^2_b` (bias_variance) and d
+    The kernel parameters are :math:`\\sigma^2` (variance), :math:`\\sigma^2_w`
+    (weight_variance), :math:`\\sigma^2_b` (bias_variance) and d
     (degree). Only gradients of the first three are provided for
     kernel optimisation, it is assumed that polynomial degree would
     be set by hand.
 
     The kernel is not recommended as it is badly behaved when the
-    :math:`\sigma^2_w\*x'\*y + \sigma^2_b` has a magnitude greater than one. For completeness
+    :math:`\\sigma^2_w\\*x'\\*y + \\sigma^2_b` has a magnitude greater than one. For completeness
     there is an automatic relevance determination version of this
     kernel provided (NOTE YET IMPLEMENTED!).
     :param input_dim: the number of input dimensions
-    :type input_dim: int 
-    :param variance: the variance :math:`\sigma^2`
+    :type input_dim: int
+    :param variance: the variance :math:`\\sigma^2`
     :type variance: float
-    :param weight_variance: the vector of the variances of the prior over input weights in the neural network :math:`\sigma^2_w`
+    :param weight_variance: the vector of the variances of the prior over input weights in the neural network :math:`\\sigma^2_w`
     :type weight_variance: array or list of the appropriate size (or float if there is only one weight variance parameter)
-    :param bias_variance: the variance of the prior over bias parameters :math:`\sigma^2_b`
+    :param bias_variance: the variance of the prior over bias parameters :math:`\\sigma^2_b`
     :param degree: the degree of the polynomial.
     :type degree: int
-    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter :math:`\sigma^2_w`), otherwise there is one weight variance parameter per dimension.
+    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter :math:`\\sigma^2_w`), otherwise there is one weight variance parameter per dimension.
     :type ARD: Boolean
     :rtype: Kernpart object
 
@@ -93,7 +93,7 @@ def _param_grad_helper(self, dL_dK, X, X2, target):
         base_cov_grad = base*dL_dK
 
 
-            
+
         target[0] += np.sum(self._K_dvar*dL_dK)
         target[1] += (self._K_inner_prod*base_cov_grad).sum()
         target[2] += base_cov_grad.sum()
@@ -107,14 +107,14 @@ def gradients_X(self, dL_dK, X, X2, target):
             target += 2*self.weight_variance*self.degree*self.variance*(((X[None,:, :])) *(arg**(self.degree-1))[:, :, None]*dL_dK[:, :, None]).sum(1)
         else:
             target += self.weight_variance*self.degree*self.variance*(((X2[None,:, :])) *(arg**(self.degree-1))[:, :, None]*dL_dK[:, :, None]).sum(1)
-            
+
     def dKdiag_dX(self, dL_dKdiag, X, target):
         """Gradient of diagonal of covariance with respect to X"""
         self._K_diag_computations(X)
         arg = self._K_diag_poly_arg
         target += 2.*self.weight_variance*self.degree*self.variance*X*dL_dKdiag[:, None]*(arg**(self.degree-1))[:, None]
-    
-    
+
+
     def _K_computations(self, X, X2):
         if self.ARD:
             pass
@@ -133,6 +133,6 @@ def _K_diag_computations(self, X):
             self._K_diag_poly_arg = (X*X).sum(1)*self.weight_variance + self.bias_variance
         self._K_diag_dvar = self._K_diag_poly_arg**self.degree
 
-  
+
 
 
diff --git a/GPy/kern/src/todo/rbf_inv.py b/GPy/kern/src/todo/rbf_inv.py
index 8405ae840..56ee7b79e 100644
--- a/GPy/kern/src/todo/rbf_inv.py
+++ b/GPy/kern/src/todo/rbf_inv.py
@@ -15,9 +15,9 @@ class RBFInv(RBF):
 
     .. math::
 
-       k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \ \ \ \ \  \\text{ where  } r^2 = \sum_{i=1}^d \\frac{ (x_i-x^\prime_i)^2}{\ell_i^2}
+       k(r) = \\sigma^2 \\exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \\ \\ \\ \\ \\  \\text{ where  } r^2 = \\sum_{i=1}^d \\frac{ (x_i-x^\\prime_i)^2}{\\ell_i^2}
 
-    where \ell_i is the lengthscale, \sigma^2 the variance and d the dimensionality of the input.
+    where \\ell_i is the lengthscale, \\sigma^2 the variance and d the dimensionality of the input.
 
     :param input_dim: the number of input dimensions
     :type input_dim: int
@@ -25,7 +25,7 @@ class RBFInv(RBF):
     :type variance: float
     :param lengthscale: the vector of lengthscale of the kernel
     :type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
-    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
+    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \\ell), otherwise there is one lengthscale parameter per dimension.
     :type ARD: Boolean
     :rtype: kernel object
 
diff --git a/GPy/kern/src/trunclinear.py b/GPy/kern/src/trunclinear.py
index bb94ae732..917ba4139 100644
--- a/GPy/kern/src/trunclinear.py
+++ b/GPy/kern/src/trunclinear.py
@@ -14,15 +14,15 @@ class TruncLinear(Kern):
 
     .. math::
 
-       k(x,y) = \sum_{i=1}^input_dim \sigma^2_i \max(0, x_iy_i - \sigma_q)
+       k(x,y) = \\sum_{i=1}^input_dim \\sigma^2_i \\max(0, x_iy_i - \\sigma_q)
 
     :param input_dim: the number of input dimensions
     :type input_dim: int
-    :param variances: the vector of variances :math:`\sigma^2_i`
+    :param variances: the vector of variances :math:`\\sigma^2_i`
     :type variances: array or list of the appropriate size (or float if there
                      is only one variance parameter)
     :param ARD: Auto Relevance Determination. If False, the kernel has only one
-                variance parameter \sigma^2, otherwise there is one variance
+                variance parameter \\sigma^2, otherwise there is one variance
                 parameter per dimension.
     :type ARD: Boolean
     :rtype: kernel object
@@ -113,15 +113,15 @@ class TruncLinear_inf(Kern):
 
     .. math::
 
-       k(x,y) = \sum_{i=1}^input_dim \sigma^2_i \max(0, x_iy_i - \sigma_q)
+       k(x,y) = \\sum_{i=1}^input_dim \\sigma^2_i \\max(0, x_iy_i - \\sigma_q)
 
     :param input_dim: the number of input dimensions
     :type input_dim: int
-    :param variances: the vector of variances :math:`\sigma^2_i`
+    :param variances: the vector of variances :math:`\\sigma^2_i`
     :type variances: array or list of the appropriate size (or float if there
                      is only one variance parameter)
     :param ARD: Auto Relevance Determination. If False, the kernel has only one
-                variance parameter \sigma^2, otherwise there is one variance
+                variance parameter \\sigma^2, otherwise there is one variance
                 parameter per dimension.
     :type ARD: Boolean
     :rtype: kernel object
diff --git a/GPy/likelihoods/bernoulli.py b/GPy/likelihoods/bernoulli.py
index 2e745b9b4..b02b6994d 100644
--- a/GPy/likelihoods/bernoulli.py
+++ b/GPy/likelihoods/bernoulli.py
@@ -243,7 +243,7 @@ def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None):
         assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
         #d3logpdf_dlink3 = 2*(y/(inv_link_f**3) - (1-y)/((1-inv_link_f)**3))
         state = np.seterr(divide='ignore')
-        # TODO check y \in {0, 1} or {-1, 1}
+        # TODO check y \\in {0, 1} or {-1, 1}
         d3logpdf_dlink3 = np.where(y==1, 2./(inv_link_f**3), -2./((1.-inv_link_f)**3))
         np.seterr(**state)
         return d3logpdf_dlink3
diff --git a/GPy/likelihoods/exponential.py b/GPy/likelihoods/exponential.py
index ecf0977eb..ef071fdfd 100644
--- a/GPy/likelihoods/exponential.py
+++ b/GPy/likelihoods/exponential.py
@@ -14,7 +14,7 @@ class Exponential(Likelihood):
     Y is expected to take values in {0,1,2,...}
     -----
     $$
-    L(x) = \exp(\lambda) * \lambda**Y_i / Y_i!
+    L(x) = \\exp(\\lambda) * \\lambda**Y_i / Y_i!
     $$
     """
     def __init__(self,gp_link=None):
@@ -46,7 +46,7 @@ def logpdf_link(self, link_f, y, Y_metadata=None):
         Log Likelihood Function given link(f)
 
         .. math::
-            \\ln p(y_{i}|\lambda(f_{i})) = \\ln \\lambda(f_{i}) - y_{i}\\lambda(f_{i})
+            \\ln p(y_{i}|\\lambda(f_{i})) = \\ln \\lambda(f_{i}) - y_{i}\\lambda(f_{i})
 
         :param link_f: latent variables (link(f))
         :type link_f: Nx1 array
@@ -65,7 +65,7 @@ def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
         Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
 
         .. math::
-            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{1}{\\lambda(f)} - y_{i}
+            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\frac{1}{\\lambda(f)} - y_{i}
 
         :param link_f: latent variables (f)
         :type link_f: Nx1 array
@@ -87,7 +87,7 @@ def d2logpdf_dlink2(self, link_f, y, Y_metadata=None):
         The hessian will be 0 unless i == j
 
         .. math::
-            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = -\\frac{1}{\\lambda(f_{i})^{2}}
+            \\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}\\lambda(f)} = -\\frac{1}{\\lambda(f_{i})^{2}}
 
         :param link_f: latent variables link(f)
         :type link_f: Nx1 array
@@ -110,7 +110,7 @@ def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
         Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
 
         .. math::
-            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2}{\\lambda(f_{i})^{3}}
+            \\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2}{\\lambda(f_{i})^{3}}
 
         :param link_f: latent variables link(f)
         :type link_f: Nx1 array
diff --git a/GPy/likelihoods/gamma.py b/GPy/likelihoods/gamma.py
index 79aba4a54..18ad580ca 100644
--- a/GPy/likelihoods/gamma.py
+++ b/GPy/likelihoods/gamma.py
@@ -54,7 +54,7 @@ def logpdf_link(self, link_f, y, Y_metadata=None):
         Log Likelihood Function given link(f)
 
         .. math::
-            \\ln p(y_{i}|\lambda(f_{i})) = \\alpha_{i}\\log \\beta - \\log \\Gamma(\\alpha_{i}) + (\\alpha_{i} - 1)\\log y_{i} - \\beta y_{i}\\\\
+            \\ln p(y_{i}|\\lambda(f_{i})) = \\alpha_{i}\\log \\beta - \\log \\Gamma(\\alpha_{i}) + (\\alpha_{i} - 1)\\log y_{i} - \\beta y_{i}\\\\
             \\alpha_{i} = \\beta y_{i}
 
         :param link_f: latent variables (link(f))
@@ -101,7 +101,7 @@ def d2logpdf_dlink2(self, link_f, y, Y_metadata=None):
         The hessian will be 0 unless i == j
 
         .. math::
-            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = -\\beta^{2}\\frac{d\\Psi(\\alpha_{i})}{d\\alpha_{i}}\\\\
+            \\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}\\lambda(f)} = -\\beta^{2}\\frac{d\\Psi(\\alpha_{i})}{d\\alpha_{i}}\\\\
             \\alpha_{i} = \\beta y_{i}
 
         :param link_f: latent variables link(f)
@@ -126,7 +126,7 @@ def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
         Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
 
         .. math::
-            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = -\\beta^{3}\\frac{d^{2}\\Psi(\\alpha_{i})}{d\\alpha_{i}}\\\\
+            \\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = -\\beta^{3}\\frac{d^{2}\\Psi(\\alpha_{i})}{d\\alpha_{i}}\\\\
             \\alpha_{i} = \\beta y_{i}
 
         :param link_f: latent variables link(f)
diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py
index 0f17a883b..bbfc81ad1 100644
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@@ -130,7 +130,7 @@ def log_predictive_density(self, y_test, mu_star, var_star, Y_metadata=None):
         Calculation of the log predictive density
 
         .. math:
-            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\\mu_{*}\\sigma^{2}_{*})
 
         :param y_test: test observations (y_{*})
         :type y_test: (Nx1) array
@@ -199,7 +199,7 @@ def log_predictive_density_sampling(self, y_test, mu_star, var_star, Y_metadata=
 
         .. math:
             log p(y_{*}|D) = log 1/num_samples prod^{S}_{s=1} p(y_{*}|f_{*s})
-            f_{*s} ~ p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+            f_{*s} ~ p(f_{*}|\\mu_{*}\\sigma^{2}_{*})
 
         :param y_test: test observations (y_{*})
         :type y_test: (Nx1) array
diff --git a/GPy/likelihoods/link_functions.py b/GPy/likelihoods/link_functions.py
index 0554bd938..dfb8f3e4f 100644
--- a/GPy/likelihoods/link_functions.py
+++ b/GPy/likelihoods/link_functions.py
@@ -145,7 +145,7 @@ class ScaledProbit(Probit):
     """
     def __init__(self, nu=1.):
         self.nu = float(nu)
-        
+
     def transf(self,f):
         return std_norm_cdf(f*self.nu)
 
@@ -157,7 +157,7 @@ def d2transf_df2(self,f):
 
     def d3transf_df3(self,f):
         return (safe_square(f*self.nu)-1.)*std_norm_pdf(f*self.nu)*(self.nu**3)
-    
+
     def to_dict(self):
         """
         Convert the object into a json serializable dictionary.
@@ -180,7 +180,7 @@ class Cloglog(GPTransformation):
 
         or
 
-        f = \log (-\log(1-p))
+        f = \\log (-\\log(1-p))
 
     """
     def transf(self,f):
diff --git a/GPy/likelihoods/poisson.py b/GPy/likelihoods/poisson.py
index d3eef7a45..5f2fe39e1 100644
--- a/GPy/likelihoods/poisson.py
+++ b/GPy/likelihoods/poisson.py
@@ -54,7 +54,7 @@ def logpdf_link(self, link_f, y, Y_metadata=None):
         Log Likelihood Function given link(f)
 
         .. math::
-            \\ln p(y_{i}|\lambda(f_{i})) = -\\lambda(f_{i}) + y_{i}\\log \\lambda(f_{i}) - \\log y_{i}!
+            \\ln p(y_{i}|\\lambda(f_{i})) = -\\lambda(f_{i}) + y_{i}\\log \\lambda(f_{i}) - \\log y_{i}!
 
         :param link_f: latent variables (link(f))
         :type link_f: Nx1 array
@@ -72,7 +72,7 @@ def dlogpdf_dlink(self, link_f, y, Y_metadata=None):
         Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
 
         .. math::
-            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{y_{i}}{\\lambda(f_{i})} - 1
+            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\frac{y_{i}}{\\lambda(f_{i})} - 1
 
         :param link_f: latent variables (f)
         :type link_f: Nx1 array
@@ -92,7 +92,7 @@ def d2logpdf_dlink2(self, link_f, y, Y_metadata=None):
         The hessian will be 0 unless i == j
 
         .. math::
-            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = \\frac{-y_{i}}{\\lambda(f_{i})^{2}}
+            \\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}\\lambda(f)} = \\frac{-y_{i}}{\\lambda(f_{i})^{2}}
 
         :param link_f: latent variables link(f)
         :type link_f: Nx1 array
@@ -113,7 +113,7 @@ def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
         Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
 
         .. math::
-            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2y_{i}}{\\lambda(f_{i})^{3}}
+            \\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{2y_{i}}{\\lambda(f_{i})^{3}}
 
         :param link_f: latent variables link(f)
         :type link_f: Nx1 array
diff --git a/GPy/likelihoods/student_t.py b/GPy/likelihoods/student_t.py
index 6c97a5d8d..4d3202fc0 100644
--- a/GPy/likelihoods/student_t.py
+++ b/GPy/likelihoods/student_t.py
@@ -78,7 +78,7 @@ def logpdf_link(self, inv_link_f, y, Y_metadata=None):
         Log Likelihood Function given link(f)
 
         .. math::
-            \\ln p(y_{i}|\lambda(f_{i})) = \\ln \\Gamma\\left(\\frac{v+1}{2}\\right) - \\ln \\Gamma\\left(\\frac{v}{2}\\right) - \\ln \\sqrt{v \\pi\\sigma^{2}} - \\frac{v+1}{2}\\ln \\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right)
+            \\ln p(y_{i}|\\lambda(f_{i})) = \\ln \\Gamma\\left(\\frac{v+1}{2}\\right) - \\ln \\Gamma\\left(\\frac{v}{2}\\right) - \\ln \\sqrt{v \\pi\\sigma^{2}} - \\frac{v+1}{2}\\ln \\left(1 + \\frac{1}{v}\\left(\\frac{(y_{i} - \\lambda(f_{i}))^{2}}{\\sigma^{2}}\\right)\\right)
 
         :param inv_link_f: latent variables (link(f))
         :type inv_link_f: Nx1 array
@@ -107,7 +107,7 @@ def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
         Gradient of the log likelihood function at y, given link(f) w.r.t link(f)
 
         .. math::
-            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\lambda(f)} = \\frac{(v+1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v}
+            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\lambda(f)} = \\frac{(v+1)(y_{i}-\\lambda(f_{i}))}{(y_{i}-\\lambda(f_{i}))^{2} + \\sigma^{2}v}
 
         :param inv_link_f: latent variables (f)
         :type inv_link_f: Nx1 array
@@ -129,7 +129,7 @@ def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None):
         The hessian will be 0 unless i == j
 
         .. math::
-            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = \\frac{(v+1)((y_{i}-\lambda(f_{i}))^{2} - \\sigma^{2}v)}{((y_{i}-\lambda(f_{i}))^{2} + \\sigma^{2}v)^{2}}
+            \\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}\\lambda(f)} = \\frac{(v+1)((y_{i}-\\lambda(f_{i}))^{2} - \\sigma^{2}v)}{((y_{i}-\\lambda(f_{i}))^{2} + \\sigma^{2}v)^{2}}
 
         :param inv_link_f: latent variables inv_link(f)
         :type inv_link_f: Nx1 array
@@ -154,7 +154,7 @@ def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None):
         Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
 
         .. math::
-            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{-2(v+1)((y_{i} - \lambda(f_{i}))^3 - 3(y_{i} - \lambda(f_{i})) \\sigma^{2} v))}{((y_{i} - \lambda(f_{i})) + \\sigma^{2} v)^3}
+            \\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = \\frac{-2(v+1)((y_{i} - \\lambda(f_{i}))^3 - 3(y_{i} - \\lambda(f_{i})) \\sigma^{2} v))}{((y_{i} - \\lambda(f_{i})) + \\sigma^{2} v)^3}
 
         :param inv_link_f: latent variables link(f)
         :type inv_link_f: Nx1 array
@@ -175,7 +175,7 @@ def dlogpdf_link_dvar(self, inv_link_f, y, Y_metadata=None):
         Gradient of the log-likelihood function at y given f, w.r.t variance parameter (t_noise)
 
         .. math::
-            \\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{d\\sigma^{2}} = \\frac{v((y_{i} - \lambda(f_{i}))^{2} - \\sigma^{2})}{2\\sigma^{2}(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})}
+            \\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{d\\sigma^{2}} = \\frac{v((y_{i} - \\lambda(f_{i}))^{2} - \\sigma^{2})}{2\\sigma^{2}(\\sigma^{2}v + (y_{i} - \\lambda(f_{i}))^{2})}
 
         :param inv_link_f: latent variables link(f)
         :type inv_link_f: Nx1 array
@@ -199,7 +199,7 @@ def dlogpdf_dlink_dvar(self, inv_link_f, y, Y_metadata=None):
         Derivative of the dlogpdf_dlink w.r.t variance parameter (t_noise)
 
         .. math::
-            \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|\lambda(f_{i}))}{df}) = \\frac{-2\\sigma v(v + 1)(y_{i}-\lambda(f_{i}))}{(y_{i}-\lambda(f_{i}))^2 + \\sigma^2 v)^2}
+            \\frac{d}{d\\sigma^{2}}(\\frac{d \\ln p(y_{i}|\\lambda(f_{i}))}{df}) = \\frac{-2\\sigma v(v + 1)(y_{i}-\\lambda(f_{i}))}{((y_{i}-\\lambda(f_{i}))^2 + \\sigma^2 v)^2}
 
         :param inv_link_f: latent variables inv_link_f
         :type inv_link_f: Nx1 array
@@ -220,7 +220,7 @@ def d2logpdf_dlink2_dvar(self, inv_link_f, y, Y_metadata=None):
         Gradient of the hessian (d2logpdf_dlink2) w.r.t variance parameter (t_noise)
 
         .. math::
-            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}f}) = \\frac{v(v+1)(\\sigma^{2}v - 3(y_{i} - \lambda(f_{i}))^{2})}{(\\sigma^{2}v + (y_{i} - \lambda(f_{i}))^{2})^{3}}
+            \\frac{d}{d\\sigma^{2}}(\\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}f}) = \\frac{v(v+1)(\\sigma^{2}v - 3(y_{i} - \\lambda(f_{i}))^{2})}{(\\sigma^{2}v + (y_{i} - \\lambda(f_{i}))^{2})^{3}}
 
         :param inv_link_f: latent variables link(f)
         :type inv_link_f: Nx1 array
diff --git a/GPy/likelihoods/weibull.py b/GPy/likelihoods/weibull.py
index ba9eb540e..61da4c66e 100644
--- a/GPy/likelihoods/weibull.py
+++ b/GPy/likelihoods/weibull.py
@@ -54,7 +54,7 @@ def logpdf_link(self, link_f, y, Y_metadata=None):
         Log Likelihood Function given link(f)
 
         .. math::
-            \\ln p(y_{i}|\lambda(f_{i})) = \\alpha_{i}\\log \\beta - \\log \\Gamma(\\alpha_{i}) + (\\alpha_{i} - 1)\\log y_{i} - \\beta y_{i}\\\\
+            \\ln p(y_{i}|\\lambda(f_{i})) = \\alpha_{i}\\log \\beta - \\log \\Gamma(\\alpha_{i}) + (\\alpha_{i} - 1)\\log y_{i} - \\beta y_{i}\\\\
             \\alpha_{i} = \\beta y_{i}
 
         :param link_f: latent variables (link(f))
@@ -117,7 +117,7 @@ def d2logpdf_dlink2(self, link_f, y, Y_metadata=None):
         The hessian will be 0 unless i == j
 
         .. math::
-            \\frac{d^{2} \\ln p(y_{i}|\lambda(f_{i}))}{d^{2}\\lambda(f)} = -\\beta^{2}\\frac{d\\Psi(\\alpha_{i})}{d\\alpha_{i}}\\\\
+            \\frac{d^{2} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{2}\\lambda(f)} = -\\beta^{2}\\frac{d\\Psi(\\alpha_{i})}{d\\alpha_{i}}\\\\
             \\alpha_{i} = \\beta y_{i}
 
         :param link_f: latent variables link(f)
@@ -150,7 +150,7 @@ def d3logpdf_dlink3(self, link_f, y, Y_metadata=None):
         Third order derivative log-likelihood function at y given link(f) w.r.t link(f)
 
         .. math::
-            \\frac{d^{3} \\ln p(y_{i}|\lambda(f_{i}))}{d^{3}\\lambda(f)} = -\\beta^{3}\\frac{d^{2}\\Psi(\\alpha_{i})}{d\\alpha_{i}}\\\\
+            \\frac{d^{3} \\ln p(y_{i}|\\lambda(f_{i}))}{d^{3}\\lambda(f)} = -\\beta^{3}\\frac{d^{2}\\Psi(\\alpha_{i})}{d\\alpha_{i}}\\\\
             \\alpha_{i} = \\beta y_{i}
 
         :param link_f: latent variables link(f)
diff --git a/GPy/mappings/additive.py b/GPy/mappings/additive.py
index 0fb72ca40..b75118e4e 100644
--- a/GPy/mappings/additive.py
+++ b/GPy/mappings/additive.py
@@ -10,7 +10,7 @@ class Additive(Mapping):
 
     .. math::
 
-       f(\mathbf{x}*) = f_1(\mathbf{x}*) + f_2(\mathbf(x)*)
+       f(\\mathbf{x}*) = f_1(\\mathbf{x}*) + f_2(\\mathbf{x}*)
 
     :param mapping1: first mapping to add together.
     :type mapping1: GPy.mappings.Mapping
diff --git a/GPy/mappings/compound.py b/GPy/mappings/compound.py
index 9078910f2..9c7dd3810 100644
--- a/GPy/mappings/compound.py
+++ b/GPy/mappings/compound.py
@@ -9,7 +9,7 @@ class Compound(Mapping):
 
     .. math::
 
-       f(\mathbf{x}) = f_2(f_1(\mathbf{x}))
+       f(\\mathbf{x}) = f_2(f_1(\\mathbf{x}))
 
     :param mapping1: first mapping
     :type mapping1: GPy.mappings.Mapping
diff --git a/GPy/mappings/constant.py b/GPy/mappings/constant.py
index 281d72d2e..85a5f6957 100644
--- a/GPy/mappings/constant.py
+++ b/GPy/mappings/constant.py
@@ -9,7 +9,7 @@ class Constant(Mapping):
 
     .. math::
 
-       F(\mathbf{x}) = c
+       F(\\mathbf{x}) = c
 
 
     :param input_dim: dimension of input.
diff --git a/GPy/mappings/kernel.py b/GPy/mappings/kernel.py
index 18b771269..dea86f5ab 100644
--- a/GPy/mappings/kernel.py
+++ b/GPy/mappings/kernel.py
@@ -12,20 +12,20 @@ class Kernel(Mapping):
 
     .. math::
 
-       f(\mathbf{x}) = \sum_i \alpha_i k(\mathbf{z}_i, \mathbf{x})
+       f(\\mathbf{x}) = \\sum_i \\alpha_i k(\\mathbf{z}_i, \\mathbf{x})
 
     or for multple outputs
 
     .. math::
 
-       f_i(\mathbf{x}) = \sum_j \alpha_{i,j} k(\mathbf{z}_i, \mathbf{x})
+       f_i(\\mathbf{x}) = \\sum_j \\alpha_{i,j} k(\\mathbf{z}_i, \\mathbf{x})
 
 
     :param input_dim: dimension of input.
     :type input_dim: int
     :param output_dim: dimension of output.
     :type output_dim: int
-    :param Z: input observations containing :math:`\mathbf{Z}`
+    :param Z: input observations containing :math:`\\mathbf{Z}`
     :type Z: ndarray
     :param kernel: a GPy kernel, defaults to GPy.kern.RBF
     :type kernel: GPy.kern.kern
diff --git a/GPy/mappings/linear.py b/GPy/mappings/linear.py
index 82b7a0e05..c708f7c0e 100644
--- a/GPy/mappings/linear.py
+++ b/GPy/mappings/linear.py
@@ -12,7 +12,7 @@ class Linear(Mapping):
 
     .. math::
 
-       F(\mathbf{x}) = \mathbf{A} \mathbf{x})
+       F(\\mathbf{x}) = \\mathbf{A} \\mathbf{x}
 
 
     :param input_dim: dimension of input.
diff --git a/GPy/models/gp_kronecker_gaussian_regression.py b/GPy/models/gp_kronecker_gaussian_regression.py
index 7898a97a5..c1cd38bf1 100644
--- a/GPy/models/gp_kronecker_gaussian_regression.py
+++ b/GPy/models/gp_kronecker_gaussian_regression.py
@@ -22,7 +22,7 @@ class GPKroneckerGaussianRegression(Model):
 
     .. rubric:: References
 
-    .. [stegle_et_al_2011] Stegle, O.; Lippert, C.; Mooij, J.M.; Lawrence, N.D.; Borgwardt, K.:Efficient inference in matrix-variate Gaussian models with \iid observation noise. In: Advances in Neural Information Processing Systems, 2011, Pages 630-638
+    .. [stegle_et_al_2011] Stegle, O.; Lippert, C.; Mooij, J.M.; Lawrence, N.D.; Borgwardt, K.:Efficient inference in matrix-variate Gaussian models with \\iid observation noise. In: Advances in Neural Information Processing Systems, 2011, Pages 630-638
 
     """
     def __init__(self, X1, X2, Y, kern1, kern2, noise_var=1., name='KGPR'):
diff --git a/GPy/models/state_space_main.py b/GPy/models/state_space_main.py
index 0280dafc2..630c31b68 100644
--- a/GPy/models/state_space_main.py
+++ b/GPy/models/state_space_main.py
@@ -4002,10 +4002,10 @@ def lti_sde_to_descrete(
         """
         Linear Time-Invariant Stochastic Differential Equation (LTI SDE):
 
-            dx(t) = F x(t) dt + L d \beta  ,where
+            dx(t) = F x(t) dt + L d \\beta  ,where
 
                 x(t): (vector) stochastic process
-                \beta: (vector) Brownian motion process
+                \\beta: (vector) Brownian motion process
                 F, L: (time invariant) matrices of corresponding dimensions
                 Qc: covariance of noise.
 
@@ -4022,7 +4022,7 @@ def lti_sde_to_descrete(
         F,L: LTI SDE matrices of corresponding dimensions
 
         Qc: matrix (n,n)
-            Covarince between different dimensions of noise \beta.
+            Covariance between different dimensions of noise \\beta.
             n is the dimensionality of the noise.
 
         dt: double or iterable
diff --git a/GPy/models/tp_regression.py b/GPy/models/tp_regression.py
index 56ecf4b55..46e223aa2 100644
--- a/GPy/models/tp_regression.py
+++ b/GPy/models/tp_regression.py
@@ -171,7 +171,7 @@ def parameters_changed(self):
 
     def log_likelihood(self):
         """
-        The log marginal likelihood of the model, :math:`p(\mathbf{y})`, this is the objective function of the model being optimised
+        The log marginal likelihood of the model, :math:`p(\\mathbf{y})`, this is the objective function of the model being optimised
         """
         return self._log_marginal_likelihood or self.inference()[1]
 
@@ -184,10 +184,10 @@ def _raw_predict(self, Xnew, full_cov=False, kern=None):
         diagonal of the covariance is returned.
 
         .. math::
-            p(f*|X*, X, Y) = \int^{\inf}_{\inf} p(f*|f,X*)p(f|X,Y) df
-                        = MVN\left(\nu + N,f*| K_{x*x}(K_{xx})^{-1}Y,
-                        \frac{\nu + \beta - 2}{\nu + N - 2}K_{x*x*} - K_{xx*}(K_{xx})^{-1}K_{xx*}\right)
-            \nu := \texttt{Degrees of freedom}
+            p(f*|X*, X, Y) = \\int^{\\inf}_{\\inf} p(f*|f,X*)p(f|X,Y) df
+                        = MVN\\left(\\nu + N,f*| K_{x*x}(K_{xx})^{-1}Y,
+                        \\frac{\\nu + \\beta - 2}{\\nu + N - 2}K_{x*x*} - K_{xx*}(K_{xx})^{-1}K_{xx*}\\right)
+            \\nu := \\texttt{Degrees of freedom}
         """
         mu, var = self.posterior._raw_predict(kern=self.kern if kern is None else kern, Xnew=Xnew,
                                               pred_var=self._predictive_variable, full_cov=full_cov)
diff --git a/GPy/models/warped_gp.py b/GPy/models/warped_gp.py
index a24401fec..0c0aaaacc 100644
--- a/GPy/models/warped_gp.py
+++ b/GPy/models/warped_gp.py
@@ -146,7 +146,7 @@ def log_predictive_density(self, x_test, y_test, Y_metadata=None):
         the jacobian of the warping function here.
 
         .. math:
-            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\mu_{*}\\sigma^{2}_{*})
+            p(y_{*}|D) = p(y_{*}|f_{*})p(f_{*}|\\mu_{*}\\sigma^{2}_{*})
 
         :param x_test: test locations (x_{*})
         :type x_test: (Nx1) array
diff --git a/GPy/testing/state_space_main_tests.py b/GPy/testing/state_space_main_tests.py
index 898eaa904..29b30ff35 100644
--- a/GPy/testing/state_space_main_tests.py
+++ b/GPy/testing/state_space_main_tests.py
@@ -159,7 +159,7 @@ def generate_brownian_data(
 ):
     """
     Generate brownian data - data from Brownian motion.
-    First point is always 0, and \Beta(0) = 0  - standard conditions for Brownian motion.
+    First point is always 0, and \\Beta(0) = 0  - standard conditions for Brownian motion.
 
     Input:
     --------------------------------
diff --git a/GPy/testing/test_model.py b/GPy/testing/test_model.py
index 954bffe96..72695a9fa 100644
--- a/GPy/testing/test_model.py
+++ b/GPy/testing/test_model.py
@@ -269,8 +269,8 @@ def test_predict_uncertain_inputs(self):
         from GPy.core.parameterization.variational import NormalPosterior
 
         X_pred = NormalPosterior(X_pred_mu, X_pred_var)
-        # mu = \int f(x)q(x|mu,S) dx = \int 2x.q(x|mu,S) dx = 2.mu
-        # S = \int (f(x) - m)^2q(x|mu,S) dx = \int f(x)^2 q(x) dx - mu**2 = 4(mu^2 + S) - (2.mu)^2 = 4S
+        # mu = \int f(x)q(x|mu,S) dx = \int 2x.q(x|mu,S) dx = 2.mu
+        # S = \int (f(x) - m)^2q(x|mu,S) dx = \int f(x)^2 q(x) dx - mu**2 = 4(mu^2 + S) - (2.mu)^2 = 4S
         Y_mu_true = 2 * X_pred_mu
         Y_var_true = 4 * X_pred_var
         Y_mu_pred, Y_var_pred = m.predict_noiseless(X_pred)
@@ -684,7 +684,7 @@ def test_warped_gp_cubic_sine(self):
         warp_m = GPy.models.WarpedGP(
             X, Y
         )  # , kernel=warp_k)#, warping_function=warp_f)
-        warp_m[".*\.d"].constrain_fixed(1.0)
+        warp_m[r".*\.d"].constrain_fixed(1.0)
         warp_m.optimize_restarts(
             parallel=False, robust=False, num_restarts=5, max_iters=max_iters
         )
diff --git a/GPy/testing/test_rv_transformation.py b/GPy/testing/test_rv_transformation.py
index 403d4c171..a6e4a666c 100644
--- a/GPy/testing/test_rv_transformation.py
+++ b/GPy/testing/test_rv_transformation.py
@@ -47,7 +47,7 @@ def _test_trans(self, trans):
         # ax.hist(phi_s, normed=True, bins=100, alpha=0.25, label='Histogram')
         # ax.plot(phi, kde(phi), '--', linewidth=2, label='Kernel Density Estimation')
         # ax.plot(phi, pdf_phi, ':', linewidth=2, label='Transformed PDF')
-        # ax.set_xlabel(r'transformed $\theta$', fontsize=16)
+        # ax.set_xlabel(r'transformed $\theta$', fontsize=16)
         # ax.set_ylabel('PDF', fontsize=16)
         # plt.legend(loc='best')
         # plt.show(block=True)
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index c406ed7ba..a8e2ea3eb 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -537,7 +537,7 @@ def google_trends(query_terms=['big data', 'machine learning', 'data science'],
         # In the notebook they did some data cleaning: remove Javascript header+footer, and translate new Date(....,..,..) into YYYY-MM-DD.
         header = """// Data table response\ngoogle.visualization.Query.setResponse("""
         data = data[len(header):-2]
-        data = re.sub('new Date\((\d+),(\d+),(\d+)\)', (lambda m: '"%s-%02d-%02d"' % (m.group(1).strip(), 1+int(m.group(2)), int(m.group(3)))), data)
+        data = re.sub(r'new Date\((\d+),(\d+),(\d+)\)', (lambda m: '"%s-%02d-%02d"' % (m.group(1).strip(), 1+int(m.group(2)), int(m.group(3)))), data)
         timeseries = json.loads(data)
         columns = [k['label'] for k in timeseries['table']['cols']]
         rows = map(lambda x: [k['v'] for k in x['c']], timeseries['table']['rows'])
@@ -782,7 +782,7 @@ def hapmap3(data_set='hapmap3'):
 
           /  1, iff SNPij==(B1,B1)
     Aij = |  0, iff SNPij==(B1,B2)
-          \ -1, iff SNPij==(B2,B2)
+          \\ -1, iff SNPij==(B2,B2)
 
     The SNP data and the meta information (such as iid, sex and phenotype) are
     stored in the dataframe datadf, index is the Individual ID,
@@ -1011,7 +1011,7 @@ def singlecell_rna_seq_deng(dataset='singlecell_deng'):
     sample_info.columns = c
 
     # get the labels right:
-    rep = re.compile('\(.*\)')
+    rep = re.compile(r'\(.*\)')
     def filter_dev_stage(row):
         if isnull(row):
             row = "2-cell stage embryo"
@@ -1050,7 +1050,7 @@ def filter_dev_stage(row):
                 #gene_info[file_info.name[:-18]] = inner.Refseq_IDs
 
     # Strip GSM number off data index
-    rep = re.compile('GSM\d+_')
+    rep = re.compile(r'GSM\d+_')
 
     from pandas import MultiIndex
     columns = MultiIndex.from_tuples([row.split('_', 1) for row in data.columns])
diff --git a/GPy/util/netpbmfile.py b/GPy/util/netpbmfile.py
index 030bd5742..dc09c64b2 100644
--- a/GPy/util/netpbmfile.py
+++ b/GPy/util/netpbmfile.py
@@ -180,24 +180,24 @@ def _read_pam_header(self, data):
         """Read PAM header and initialize instance."""
         regroups = re.search(
             b"(^P7[\n\r]+(?:(?:[\n\r]+)|(?:#.*)|"
-            b"(HEIGHT\s+\d+)|(WIDTH\s+\d+)|(DEPTH\s+\d+)|(MAXVAL\s+\d+)|"
-            b"(?:TUPLTYPE\s+\w+))*ENDHDR\n)", data).groups()
+            rb"(HEIGHT\s+\d+)|(WIDTH\s+\d+)|(DEPTH\s+\d+)|(MAXVAL\s+\d+)|"
+            rb"(?:TUPLTYPE\s+\w+))*ENDHDR\n)", data).groups()
         self.header = regroups[0]
         self.magicnum = b'P7'
         for group in regroups[1:]:
             key, value = group.split()
             setattr(self, unicode(key).lower(), int(value))
-        matches = re.findall(b"(TUPLTYPE\s+\w+)", self.header)
+        matches = re.findall(rb"(TUPLTYPE\s+\w+)", self.header)
         self.tupltypes = [s.split(None, 1)[1] for s in matches]
 
     def _read_pnm_header(self, data):
         """Read PNM header and initialize instance."""
         bpm = data[1:2] in b"14"
         regroups = re.search(b"".join((
-            b"(^(P[123456]|P7 332)\s+(?:#.*[\r\n])*",
-            b"\s*(\d+)\s+(?:#.*[\r\n])*",
-            b"\s*(\d+)\s+(?:#.*[\r\n])*" * (not bpm),
-            b"\s*(\d+)\s(?:\s*#.*[\r\n]\s)*)")), data).groups() + (1, ) * bpm
+            rb"(^(P[123456]|P7 332)\s+(?:#.*[\r\n])*",
+            rb"\s*(\d+)\s+(?:#.*[\r\n])*",
+            rb"\s*(\d+)\s+(?:#.*[\r\n])*" * (not bpm),
+            rb"\s*(\d+)\s(?:\s*#.*[\r\n]\s)*)")), data).groups() + (1, ) * bpm
         self.header = regroups[0]
         self.magicnum = regroups[1]
         self.width = int(regroups[2])
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 5fe9be772..a6a7c9cf9 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -150,7 +150,7 @@ def __getattr__(cls, name):
     version = f.read()
     release = version
 
-print version
+print(version)
 
 # version = '0.8.8'
 # The full version, including alpha/beta/rc tags.