Doc corrections and unicode(theta) #184

Open · wants to merge 4 commits into master
Changes from 3 commits
32 changes: 9 additions & 23 deletions src/activation.jl
@@ -28,10 +28,9 @@ end
Segment-wise linear approximation of sigmoid.
See [BinaryConnect: Training Deep Neural Networks with binary weights during propagations](https://arxiv.org/pdf/1511.00363.pdf).
"""
hardσ(x::Real, a=0.2) = oftype(x/1, max(zero(x/1), min(one(x/1), oftype(x/1,a) * x + oftype(x/1,0.5))))
hardσ(x::Real, a=0.2) = oftype(x / 1, max(zero(x / 1), min(one(x / 1), oftype(x / 1, a) * x + oftype(x / 1, 0.5))))
const hardsigmoid = hardσ
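An aside on the `x / 1` idiom used here (illustrative, not part of the diff): dividing by `1` promotes an integer input to floating point, and `oftype(x / 1, a)` converts the coefficient to that same float type, so every branch of the expression returns one consistent type:

    x = 3                        # an Int argument
    x / 1                        # 3.0, promoted to Float64
    oftype(x / 1, 0.2)           # 0.2 stored as a Float64
    hardσ(3)                     # 1.0  (0.2 * 3 + 0.5 = 1.1, clamped to [0, 1])
    hardσ(-3.0f0)                # 0.0f0, a Float32 input stays Float32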


"""
logσ(x)

@@ -48,7 +47,6 @@ Return `log(σ(x))` which is computed in a numerically stable way.
logσ(x::Real) = -softplus(-x)
const logsigmoid = logσ


"""
hardtanh(x) = max(-1, min(1, x))

@@ -57,7 +55,6 @@ See [Large Scale Machine Learning](http://ronan.collobert.org/pub/matos/2004_phd
"""
hardtanh(x::Real) = max(-one(x), min( one(x), x))


"""
relu(x) = max(0, x)

@@ -66,15 +63,14 @@ activation function.
"""
relu(x::Real) = max(zero(x), x)


"""
leakyrelu(x, a=0.01) = max(a*x, x)

Leaky [Rectified Linear Unit](https://en.wikipedia.org/wiki/Rectifier_(neural_networks))
activation function.
You can also specify the coefficient explicitly, e.g. `leakyrelu(x, 0.01)`.
"""
leakyrelu(x::Real, a = oftype(x / 1, 0.01)) = max(a * x, x / one(x))
leakyrelu(x::Real, a=0.01) = max(oftype(x / 1, a) * x, x / 1)

"""
relu6(x) = min(max(0, x), 6)
@@ -107,8 +103,7 @@ Exponential Linear Unit activation function.
See [Fast and Accurate Deep Network Learning by Exponential Linear Units](https://arxiv.org/abs/1511.07289).
You can also specify the coefficient explicitly, e.g. `elu(x, 1)`.
"""
elu(x::Real, α = one(x)) = ifelse(x ≥ 0, x / one(x), α * (exp(x) - one(x)))

elu(x::Real, α = one(x)) = ifelse(x ≥ 0, x / 1, α * (exp(x) - one(x)))

"""
gelu(x) = 0.5x * (1 + tanh(√(2/π) * (x + 0.044715x^3)))
@@ -124,7 +119,6 @@ function gelu(x::Real)
h * x * (one(x) + tanh(λ * (x + α * x^3)))
end


"""
swish(x) = x * σ(x)

@@ -133,7 +127,6 @@ See [Swish: a Self-Gated Activation Function](https://arxiv.org/pdf/1710.05941.p
"""
swish(x::Real) = x * σ(x)


"""
lisht(x) = x * tanh(x)

@@ -142,7 +135,6 @@ See [LiSHT](https://arxiv.org/abs/1901.05894)
"""
lisht(x::Real) = x * tanh(x)


"""
selu(x) = λ * (x ≥ 0 ? x : α * (exp(x) - 1))

@@ -155,53 +147,47 @@ See [Self-Normalizing Neural Networks](https://arxiv.org/pdf/1706.02515.pdf).
function selu(x::Real)
λ = oftype(x / 1, 1.0507009873554804934193349852946)
α = oftype(x / 1, 1.6732632423543772848170429916717)
λ * ifelse(x > 0, x / one(x), α * (exp(x) - one(x)))
λ * ifelse(x > 0, x / 1, α * (exp(x) - one(x)))
end

"""
celu(x, α=1) =
(x ≥ 0 ? x : α * (exp(x/α) - 1))

Continuously Differentiable Exponential Linear Units
See [Continuously Differentiable Exponential Linear Units](https://arxiv.org/pdf/1704.07483.pdf).
"""
celu(x::Real, α::Real = one(x)) = ifelse(x ≥ 0, x / one(x), α * (exp(x/α) - one(x)))

celu(x::Real, α::Real = one(x)) = ifelse(x ≥ 0, x / 1, α * (exp(x/α) - one(x)))

"""
trelu(x, theta = 1.0) = x > theta ? x : 0
trelu(x, θ=1.0) = x > θ ? x : 0

Threshold Gated Rectified Linear.
See [ThresholdRelu](https://arxiv.org/pdf/1402.3337.pdf)
"""
trelu(x::Real,theta = one(x)) = ifelse(x> theta, x, zero(x))
trelu(x::Real,θ = one(x)) = ifelse(x> θ, x, zero(x))
Suggested change
trelu(x::Real,θ = one(x)) = ifelse(x> θ, x, zero(x))
trelu(x::Real, θ=one(x)) = ifelse(x> θ, x, zero(x))

const thresholdrelu = trelu


"""
softsign(x) = x / (1 + |x|)

See [Quadratic Polynomials Learn Better Image Features](http://www.iro.umontreal.ca/~lisa/publications2/index.php/attachments/single/205).
"""
softsign(x::Real) = x / (one(x) + abs(x))


"""
softplus(x) = log(exp(x) + 1)

See [Deep Sparse Rectifier Neural Networks](http://proceedings.mlr.press/v15/glorot11a/glorot11a.pdf).
"""
softplus(x::Real) = ifelse(x > 0, x + log1p(exp(-x)), log1p(exp(x)))
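A quick illustration of why the branched form is numerically stable (plain Julia REPL values, not part of the diff):

    x = 1000.0
    log(exp(x) + 1)              # Inf: exp(1000.0) overflows Float64
    x + log1p(exp(-x))           # 1000.0, the x > 0 branch used above
    log1p(exp(-50.0))            # ≈ 1.93e-22, the other branch keeps precision for very negative x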


"""
logcosh(x)
logcosh(x) = x + softplus(-2x) - log(2)
Reviewer (Member): the mathematical definition fits better further down in the docstring.

Author (Contributor): So should this be changed for all the existing functions? In all the rest of the functions, the mathematical definition is written beside the function.


Return `log(cosh(x))` which is computed in a numerically stable way.
"""
logcosh(x::Real) = x + softplus(-2x) - log(oftype(x, 2))
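The identity behind this form, for reference: log(cosh(x)) = log((e^x + e^(-x)) / 2) = x + log(1 + e^(-2x)) - log(2) = x + softplus(-2x) - log(2), which avoids evaluating cosh(x) itself. A small check (assumes the `softplus` defined above is in scope):

    x = 1000.0
    log(cosh(x))                 # Inf: cosh(1000.0) overflows Float64
    x + softplus(-2x) - log(2)   # ≈ 999.30685, i.e. x - log(2), the correct value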


"""
mish(x) = x * tanh(softplus(x))

@@ -223,7 +209,7 @@ tanhshrink(x::Real) = x - tanh(x)

See [Softshrink Activation Function](https://www.gabormelli.com/RKB/Softshrink_Activation_Function).
"""
softshrink(x::Real, λ = oftype(x/1, 0.5)) = min(max(zero(x), x - λ), x + λ)
softshrink(x::Real, λ = oftype(x / 1, 0.5)) = min(max(zero(x), x - λ), x + λ)

# Provide an informative error message if activation functions are called with an array
for f in (:σ, :σ_stable, :hardσ, :logσ, :hardtanh, :relu, :leakyrelu, :relu6, :rrelu, :elu, :gelu, :swish, :lisht, :selu, :celu, :trelu, :softsign, :softplus, :logcosh, :mish, :tanhshrink, :softshrink)
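The loop body is collapsed in this view; as a rough sketch of the kind of error method such a loop typically generates via `@eval` (an assumption for illustration, not the actual NNlib code):

    for f in (:relu, :σ)         # illustrative subset of the list above
        @eval begin
            # Define a method that catches array arguments and points users to broadcasting.
            $(f)(x::AbstractArray, args...) =
                error("Use broadcasting (`", $(string(f)), ".(x)`) to apply `", $(string(f)), "` to an array")
        end
    end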
6 changes: 2 additions & 4 deletions src/conv.jl
@@ -29,7 +29,7 @@ export conv, conv!, ∇conv_data, ∇conv_data!, ∇conv_filter, ∇conv_filter!


# First, we will define mappings from the generic API names to our accelerated backend
# implementations. For homogeneous-datatype 1, 2 and 3d convolutions, we default to using
# implementations. For homogeneous-datatype 1d, 2d and 3d convolutions, we default to using
# im2col + GEMM. Do so in a loop, here:
for (front_name, backend) in (
# This maps from public, front-facing name, to internal backend name
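The rest of the loop is collapsed here; a hedged sketch of the metaprogramming pattern this comment describes (names and signatures are illustrative, not copied from the file): each public front-end name gets a thin method that forwards to its backend-specific implementation.

    # Illustrative only: forward e.g. `conv!` to `conv_im2col!`
    for (front_name, backend) in ((:conv, :im2col), (:depthwiseconv, :im2col))
        fwd  = Symbol("$(front_name)!")              # e.g. :conv!
        impl = Symbol("$(front_name)_$(backend)!")   # e.g. :conv_im2col!
        @eval $(fwd)(y, x, w, cdims; kwargs...) = $(impl)(y, x, w, cdims; kwargs...)
    end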
@@ -86,7 +86,7 @@ end

# We always support a fallback, non-accelerated path, where we use the direct, but
# slow, implementations. These should not typically be used, hence the `@debug`,
# but let's ggo ahead and define them first:
# but let's go ahead and define them first:
for front_name in (:conv, :∇conv_data, :∇conv_filter,
:depthwiseconv, :∇depthwiseconv_data, :∇depthwiseconv_filter)
@eval begin
@@ -179,8 +179,6 @@ function conv(x, w::AbstractArray{T, N}; stride=1, pad=0, dilation=1, flipped=fa
end




"""
depthwiseconv(x, w; stride=1, pad=0, dilation=1, flipped=false)

6 changes: 3 additions & 3 deletions src/impl/conv_direct.jl
@@ -30,9 +30,9 @@ kernel, storing the result in a `Float32` output, there is at least a function c
for that madness.

The keyword arguments `alpha` and `beta` control accumulation behavior; this function
calculates `y = alpha * x * w + beta * y`, therefore by setting `beta` to a nonzero
value, the user is able to accumulate values into a preallocated `y` buffer, or by
setting `alpha` to a nonunitary value, an arbitrary gain factor can be applied.
calculates `y = alpha * x * w + beta * y`, therefore by setting `beta` to a non-zero
value, the user is able to accumulate values into a pre-allocated `y` buffer, or by
setting `alpha` to a non-unitary value, an arbitrary gain factor can be applied.

By defaulting `beta` to `false`, we make use of the Bradbury promotion trick to override
`NaN`'s that may pre-exist within our output buffer, as `false*NaN == 0.0`, whereas
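The "Bradbury promotion trick" mentioned above relies on `false` acting as a strong zero in Julia; a minimal illustration (not part of the diff):

    false * NaN                  # 0.0: beta = false wipes whatever was in the output buffer
    0.0 * NaN                    # NaN: a numeric zero would propagate pre-existing NaNs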
11 changes: 4 additions & 7 deletions src/impl/conv_im2col.jl
@@ -16,8 +16,8 @@ end
Perform a convolution using im2col and GEMM, store the result in `y`. The kwargs
`alpha` and `beta` control accumulation behavior; internally this operation is
implemented as a matrix multiply that boils down to `y = alpha * x * w + beta * y`, thus
by setting `beta` to a nonzero value, multiple results can be accumulated into `y`, or
by setting `alpha` to a nonunitary value, various gain factors can be applied.
by setting `beta` to a non-zero value, multiple results can be accumulated into `y`, or
by setting `alpha` to a non-unitary value, various gain factors can be applied.

Note for the particularly performance-minded, you can provide a pre-allocated `col`,
which should eliminate any need for large allocations within this method.
@@ -39,7 +39,7 @@ function conv_im2col!(
# In english, we're grabbing each input patch and laying them out along
# the M dimension in `col`, so that the GEMM call below multiplies each
# kernel (which is kernel_h * kernel_w * channels_in elments long) is
# dotproducted with that input patch, effectively computing a convolution
# dot-producted with that input patch, effectively computing a convolution
# in a somewhat memory-wasteful but easily-computed way (since we already
# have an extremely highly-optimized GEMM call available in BLAS).
M = prod(output_size(cdims))
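To make the comment above concrete, a toy 1-D sketch of the im2col idea (illustrative only, not the implementation in this file): each row of `col` holds one input patch, so the whole convolution collapses into a single GEMM-style product.

    x = [1.0, 2.0, 3.0, 4.0, 5.0]                     # input signal
    w = [1.0, 0.0, -1.0]                              # length-3 kernel
    k, n_out = length(w), length(x) - length(w) + 1
    col = [x[i + j - 1] for i in 1:n_out, j in 1:k]   # n_out × k matrix of patches
    y = col * w                                       # [-2.0, -2.0, -2.0], a cross-correlation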
@@ -162,9 +162,6 @@ function ∇conv_data_im2col!(
end





"""
im2col!(col, x, cdims)

@@ -233,7 +230,7 @@ function im2col!(col::AbstractArray{T,2}, x::AbstractArray{T,4},
end
end


# For each "padded region", we run the fully general version
@inbounds for (w_region, h_region, d_region) in padded_regions
for c in 1:C_in,