From 2b0ab921d074bbbd10fc6b6ef2b87eb6376cc414 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=BAlio=20Hoffimann?= <julio.hoffimann@gmail.com>
Date: Thu, 5 Oct 2017 10:13:55 -0700
Subject: [PATCH] Fix #71: Add Haversine distance (#77)

* Fix #71: Add Haversine distance

* Add Haversine to README.md

* Add Haversine to benchmarks

* Add benchmark results for Haversine to README.md

* Add helper function haversine() with default Earth radius

* Add tests for Haversine distance

* Use 4 spaces for indentation

* Use 4 spaces for indentation on test suite

* Remove default radius for Haversine distance

* Fix typo in Haversine tests

* fixups
---
 README.md                |  4 ++++
 benchmark/benchmarks.jl  |  4 +++-
 benchmark/print_table.jl |  3 ++-
 src/Distances.jl         |  5 +++++
 src/haversine.jl         | 39 +++++++++++++++++++++++++++++++++++++++
 test/test_dists.jl       | 22 +++++++++++++++++++---
 6 files changed, 72 insertions(+), 5 deletions(-)
 create mode 100644 src/haversine.jl

diff --git a/README.md b/README.md
index 70c057c..40be705 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,7 @@ This package also provides optimized functions to compute column-wise and pairwi
 * Squared Mahalanobis distance
 * Bhattacharyya distance
 * Hellinger distance
+* Haversine distance
 * Mean absolute deviation
 * Mean squared deviation
 * Root mean squared deviation
@@ -148,6 +149,7 @@ Each distance corresponds to a distance type. The type name and the correspondin
 |  SpanNormDist        |  `spannorm_dist(x, y)`     | `max(x - y) - min(x - y )` |
 |  BhattacharyyaDist   |  `bhattacharyya(x, y)`     | `-log(sum(sqrt(x .* y) / sqrt(sum(x) * sum(y)))` |
 |  HellingerDist       |  `hellinger(x, y) `        | `sqrt(1 - sum(sqrt(x .* y) / sqrt(sum(x) * sum(y))))` |
+|  Haversine           |  `haversine(x, y, r)`      | [Haversine formula](https://en.wikipedia.org/wiki/Haversine_formula) |
 |  Mahalanobis         |  `mahalanobis(x, y, Q)`    | `sqrt((x - y)' * Q * (x - y))` |
 |  SqMahalanobis       |  `sqmahalanobis(x, y, Q)`  | ` (x - y)' * Q * (x - y)`  |
 |  MeanAbsDeviation    |  `meanad(x, y)`            | `mean(abs.(x - y))` |
@@ -226,6 +228,7 @@ The table below compares the performance (measured in terms of average elapsed t
 | WeightedHamming | 0.009042s |  0.003182s |  2.8417 |
 | SqMahalanobis | 0.070869s |  0.019199s |  3.6913 |
 | Mahalanobis | 0.070980s |  0.019305s |  3.6768 |
+| Haversine | 0.006549s |  0.000809s |  8.0967 |
 
 We can see that using ``colwise`` instead of a simple loop yields considerable gain (2x - 4x), especially when the internal computation of each distance is simple. Nonetheless, when the computation of a single distance is heavy enough (e.g. *KLDivergence*,  *RenyiDivergence*), the gain is not as significant.
 
@@ -257,5 +260,6 @@ The table below compares the performance (measured in terms of average elapsed t
 | WeightedHamming | 0.024456s |  0.007403s |  3.3033 |
 | SqMahalanobis | 0.113107s |  0.000366s | **309.3621** |
 | Mahalanobis | 0.114646s |  0.000686s | **167.0595** |
+| Haversine | 0.015267s |  0.003656s |  4.1763 |
 
 For distances of which a major part of the computation is a quadratic form (e.g. *Euclidean*, *CosineDist*, *Mahalanobis*), the performance can be drastically improved by restructuring the computation and delegating the core part to ``GEMM`` in *BLAS*. The use of this strategy can easily lead to 100x performance gain over simple loops (see the highlighted part of the table above).
diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl
index 72a0e90..bd08adc 100644
--- a/benchmark/benchmarks.jl
+++ b/benchmark/benchmarks.jl
@@ -19,6 +19,8 @@ function create_distances(w, Q)
         BhattacharyyaDist(),
         HellingerDist(),
 
+        Haversine(),
+
         WeightedSqEuclidean(w),
         WeightedEuclidean(w),
         WeightedCityblock(w),
@@ -57,7 +59,7 @@ function evaluate_colwise(dist, x, y)
 end
 
 function add_colwise_benchmarks!(SUITE)
-    
+
     m = 200
     n = 10000
 
diff --git a/benchmark/print_table.jl b/benchmark/print_table.jl
index b4a07af..e440b85 100644
--- a/benchmark/print_table.jl
+++ b/benchmark/print_table.jl
@@ -31,6 +31,7 @@ order = [
     :WeightedHamming,
     :SqMahalanobis,
     :Mahalanobis,
+    :Haversine,
 ]
 
 BenchmarkTools.DEFAULT_PARAMETERS.seconds = 2.0 # Long enough
@@ -67,4 +68,4 @@ function print_table(judgement)
     end
 end
 
-print_table(judgement)
\ No newline at end of file
+print_table(judgement)
diff --git a/src/Distances.jl b/src/Distances.jl
index f45daae..f949b73 100644
--- a/src/Distances.jl
+++ b/src/Distances.jl
@@ -45,6 +45,8 @@ export
     BhattacharyyaDist,
     HellingerDist,
 
+    Haversine,
+
     MeanAbsDeviation,
     MeanSqDeviation,
     RMSDeviation,
@@ -79,6 +81,8 @@ export
     bhattacharyya,
     hellinger,
 
+    haversine,
+
     meanad,
     msd,
     rmsd,
@@ -88,6 +92,7 @@ include("common.jl")
 include("generic.jl")
 include("metrics.jl")
 include("wmetrics.jl")
+include("haversine.jl")
 include("mahalanobis.jl")
 include("bhattacharyya.jl")
 
diff --git a/src/haversine.jl b/src/haversine.jl
new file mode 100644
index 0000000..693eff3
--- /dev/null
+++ b/src/haversine.jl
@@ -0,0 +1,39 @@
+"""
+    Haversine(radius)
+
+The haversine distance between two locations on a sphere of given `radius`.
+
+Locations are described with longitude and latitude in degrees.
+The computed distance has the same units as that of the radius.
+"""
+struct Haversine{T<:Real} <: Metric
+    radius::T
+end
+
+const VecOrLengthTwoTuple{T} = Union{AbstractVector{T}, NTuple{2, T}}
+
+function evaluate(dist::Haversine, x::VecOrLengthTwoTuple, y::VecOrLengthTwoTuple) 
+    length(x) == length(y) == 2 || haversine_error()
+
+    @inbounds begin
+        # longitudes
+        Δλ = deg2rad(y[1] - x[1])
+
+        # latitudes
+        φ₁ = deg2rad(x[2])
+        φ₂ = deg2rad(y[2])
+    end
+
+    Δφ = φ₂ - φ₁
+
+    # haversine formula
+    a = sin(Δφ/2)^2 + cos(φ₁)*cos(φ₂)*sin(Δλ/2)^2
+    c = 2atan2(√a, √(1-a))
+
+    # distance on the sphere
+    c*dist.radius
+end
+
+haversine(x::VecOrLengthTwoTuple, y::VecOrLengthTwoTuple, radius::Real) = evaluate(Haversine(radius), x, y)
+
+@noinline haversine_error() = throw(ArgumentError("expected both inputs to have length 2 in Haversine distance"))
\ No newline at end of file
diff --git a/test/test_dists.jl b/test/test_dists.jl
index 8e13fb4..de2a682 100644
--- a/test/test_dists.jl
+++ b/test/test_dists.jl
@@ -58,7 +58,13 @@ end
 
         test_metricity(BhattacharyyaDist(), x, y, z)
         test_metricity(HellingerDist(), x, y, z)
-    
+
+        x₁ = rand(T, 2)
+        x₂ = rand(T, 2)
+        x₃ = rand(T, 2)
+
+        test_metricity(Haversine(6371.), x₁, x₂, x₃)
+
         k = rand(1:3, n)
         l = rand(1:3, n)
         m = rand(1:3, n)
@@ -154,7 +160,7 @@ end
             @test wcityblock(x, y, w) ≈ dot(abs.(x - vec(y)), w)
             @test wminkowski(x, y, w, 2) ≈ weuclidean(x, y, w)
         end
- 
+
 
         # Test weighted Hamming distances with even weights
         a = T.([1.0, 2.0, 1.0, 3.0, 2.0, 1.0])
@@ -187,7 +193,7 @@ end
         end
         @test kl_divergence(p, q) ≈ klv
         @test typeof(kl_divergence(p, q)) == T
-   
+
 
         @test renyi_divergence(p, r, 0) ≈ -log(scale)
         @test renyi_divergence(p, r, 1) ≈ -log(scale)
@@ -272,6 +278,16 @@ end # testset
     end
 end #testset
 
+@testset "haversine" begin
+    for T in (Float64, F64)
+        @test haversine([-180.,0.], [180.,0.], 1.) ≈ 0 atol=1e-10
+        @test haversine([0.,-90.],  [0.,90.],  1.) ≈ π atol=1e-10
+        @test haversine((-180.,0.), (180.,0.), 1.) ≈ 0 atol=1e-10
+        @test haversine((0.,-90.),  (0.,90.),  1.) ≈ π atol=1e-10
+        @test_throws ArgumentError haversine([0.,-90., 0.25], [0.,90.], 1.)
+    end
+end
+
 @testset "bhattacharyya / hellinger" begin
     for T in (Float64, F64)
         x, y = T.([4.0, 5.0, 6.0, 7.0]), T.([3.0, 9.0, 8.0, 1.0])