From 2b0ab921d074bbbd10fc6b6ef2b87eb6376cc414 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=BAlio=20Hoffimann?= Date: Thu, 5 Oct 2017 10:13:55 -0700 Subject: [PATCH] Fix #71: Add Haversine distance (#77) * Fix #71: Add Haversine distance * Add Haversine to README.md * Add Haversine to benchmarks * Add benchmark results for Haversine to README.md * Add helper function haversine() with default Earth radius * Add tests for Haversine distance * Use 4 spaces for indentation * Use 4 spaces for indentation on test suite * Remove default radius for Haversine distance * Fix typo in Haversine tests * fixups --- README.md | 4 ++++ benchmark/benchmarks.jl | 4 +++- benchmark/print_table.jl | 3 ++- src/Distances.jl | 5 +++++ src/haversine.jl | 39 +++++++++++++++++++++++++++++++++++++++ test/test_dists.jl | 22 +++++++++++++++++++--- 6 files changed, 72 insertions(+), 5 deletions(-) create mode 100644 src/haversine.jl diff --git a/README.md b/README.md index 70c057c..40be705 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ This package also provides optimized functions to compute column-wise and pairwi * Squared Mahalanobis distance * Bhattacharyya distance * Hellinger distance +* Haversine distance * Mean absolute deviation * Mean squared deviation * Root mean squared deviation @@ -148,6 +149,7 @@ Each distance corresponds to a distance type. The type name and the correspondin | SpanNormDist | `spannorm_dist(x, y)` | `max(x - y) - min(x - y )` | | BhattacharyyaDist | `bhattacharyya(x, y)` | `-log(sum(sqrt(x .* y) / sqrt(sum(x) * sum(y)))` | | HellingerDist | `hellinger(x, y) ` | `sqrt(1 - sum(sqrt(x .* y) / sqrt(sum(x) * sum(y))))` | +| Haversine | `haversine(x, y, r)` | [Haversine formula](https://en.wikipedia.org/wiki/Haversine_formula) | | Mahalanobis | `mahalanobis(x, y, Q)` | `sqrt((x - y)' * Q * (x - y))` | | SqMahalanobis | `sqmahalanobis(x, y, Q)` | ` (x - y)' * Q * (x - y)` | | MeanAbsDeviation | `meanad(x, y)` | `mean(abs.(x - y))` | @@ -226,6 +228,7 @@ The table below compares the performance (measured in terms of average elapsed t | WeightedHamming | 0.009042s | 0.003182s | 2.8417 | | SqMahalanobis | 0.070869s | 0.019199s | 3.6913 | | Mahalanobis | 0.070980s | 0.019305s | 3.6768 | +| Haversine | 0.006549s | 0.000809s | 8.0967 | We can see that using ``colwise`` instead of a simple loop yields considerable gain (2x - 4x), especially when the internal computation of each distance is simple. Nonetheless, when the computation of a single distance is heavy enough (e.g. *KLDivergence*, *RenyiDivergence*), the gain is not as significant. @@ -257,5 +260,6 @@ The table below compares the performance (measured in terms of average elapsed t | WeightedHamming | 0.024456s | 0.007403s | 3.3033 | | SqMahalanobis | 0.113107s | 0.000366s | **309.3621** | | Mahalanobis | 0.114646s | 0.000686s | **167.0595** | +| Haversine | 0.015267s | 0.003656s | 4.1763 | For distances of which a major part of the computation is a quadratic form (e.g. *Euclidean*, *CosineDist*, *Mahalanobis*), the performance can be drastically improved by restructuring the computation and delegating the core part to ``GEMM`` in *BLAS*. The use of this strategy can easily lead to 100x performance gain over simple loops (see the highlighted part of the table above). diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 72a0e90..bd08adc 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -19,6 +19,8 @@ function create_distances(w, Q) BhattacharyyaDist(), HellingerDist(), + Haversine(), + WeightedSqEuclidean(w), WeightedEuclidean(w), WeightedCityblock(w), @@ -57,7 +59,7 @@ function evaluate_colwise(dist, x, y) end function add_colwise_benchmarks!(SUITE) - + m = 200 n = 10000 diff --git a/benchmark/print_table.jl b/benchmark/print_table.jl index b4a07af..e440b85 100644 --- a/benchmark/print_table.jl +++ b/benchmark/print_table.jl @@ -31,6 +31,7 @@ order = [ :WeightedHamming, :SqMahalanobis, :Mahalanobis, + :Haversine, ] BenchmarkTools.DEFAULT_PARAMETERS.seconds = 2.0 # Long enough @@ -67,4 +68,4 @@ function print_table(judgement) end end -print_table(judgement) \ No newline at end of file +print_table(judgement) diff --git a/src/Distances.jl b/src/Distances.jl index f45daae..f949b73 100644 --- a/src/Distances.jl +++ b/src/Distances.jl @@ -45,6 +45,8 @@ export BhattacharyyaDist, HellingerDist, + Haversine, + MeanAbsDeviation, MeanSqDeviation, RMSDeviation, @@ -79,6 +81,8 @@ export bhattacharyya, hellinger, + haversine, + meanad, msd, rmsd, @@ -88,6 +92,7 @@ include("common.jl") include("generic.jl") include("metrics.jl") include("wmetrics.jl") +include("haversine.jl") include("mahalanobis.jl") include("bhattacharyya.jl") diff --git a/src/haversine.jl b/src/haversine.jl new file mode 100644 index 0000000..693eff3 --- /dev/null +++ b/src/haversine.jl @@ -0,0 +1,39 @@ +""" + Haversine(radius) + +The haversine distance between two locations on a sphere of given `radius`. + +Locations are described with longitude and latitude in degrees. +The computed distance has the same units as that of the radius. +""" +struct Haversine{T<:Real} <: Metric + radius::T +end + +const VecOrLengthTwoTuple{T} = Union{AbstractVector{T}, NTuple{2, T}} + +function evaluate(dist::Haversine, x::VecOrLengthTwoTuple, y::VecOrLengthTwoTuple) + length(x) == length(y) == 2 || haversine_error() + + @inbounds begin + # longitudes + Δλ = deg2rad(y[1] - x[1]) + + # latitudes + φ₁ = deg2rad(x[2]) + φ₂ = deg2rad(y[2]) + end + + Δφ = φ₂ - φ₁ + + # haversine formula + a = sin(Δφ/2)^2 + cos(φ₁)*cos(φ₂)*sin(Δλ/2)^2 + c = 2atan2(√a, √(1-a)) + + # distance on the sphere + c*dist.radius +end + +haversine(x::VecOrLengthTwoTuple, y::VecOrLengthTwoTuple, radius::Real) = evaluate(Haversine(radius), x, y) + +@noinline haversine_error() = throw(ArgumentError("expected both inputs to have length 2 in Haversine distance")) \ No newline at end of file diff --git a/test/test_dists.jl b/test/test_dists.jl index 8e13fb4..de2a682 100644 --- a/test/test_dists.jl +++ b/test/test_dists.jl @@ -58,7 +58,13 @@ end test_metricity(BhattacharyyaDist(), x, y, z) test_metricity(HellingerDist(), x, y, z) - + + x₁ = rand(T, 2) + x₂ = rand(T, 2) + x₃ = rand(T, 2) + + test_metricity(Haversine(6371.), x₁, x₂, x₃) + k = rand(1:3, n) l = rand(1:3, n) m = rand(1:3, n) @@ -154,7 +160,7 @@ end @test wcityblock(x, y, w) ≈ dot(abs.(x - vec(y)), w) @test wminkowski(x, y, w, 2) ≈ weuclidean(x, y, w) end - + # Test weighted Hamming distances with even weights a = T.([1.0, 2.0, 1.0, 3.0, 2.0, 1.0]) @@ -187,7 +193,7 @@ end end @test kl_divergence(p, q) ≈ klv @test typeof(kl_divergence(p, q)) == T - + @test renyi_divergence(p, r, 0) ≈ -log(scale) @test renyi_divergence(p, r, 1) ≈ -log(scale) @@ -272,6 +278,16 @@ end # testset end end #testset +@testset "haversine" begin + for T in (Float64, F64) + @test haversine([-180.,0.], [180.,0.], 1.) ≈ 0 atol=1e-10 + @test haversine([0.,-90.], [0.,90.], 1.) ≈ π atol=1e-10 + @test haversine((-180.,0.), (180.,0.), 1.) ≈ 0 atol=1e-10 + @test haversine((0.,-90.), (0.,90.), 1.) ≈ π atol=1e-10 + @test_throws ArgumentError haversine([0.,-90., 0.25], [0.,90.], 1.) + end +end + @testset "bhattacharyya / hellinger" begin for T in (Float64, F64) x, y = T.([4.0, 5.0, 6.0, 7.0]), T.([3.0, 9.0, 8.0, 1.0])