From 50569b528c5bbfbf09154b7e17cb0f4ba723d1f0 Mon Sep 17 00:00:00 2001
From: Bence Ferdinandy <bence@ferdinandy.com>
Date: Sun, 28 May 2023 21:05:55 +0200
Subject: [PATCH] output: add NormalizedName

mra uses the most frequent display name of each correspondent. People
with non-ASCII names sometimes write their name with its proper
non-ASCII spelling and sometimes convert it to ASCII, so it is hard to
know what exactly to search for. Add NormalizedName to AddressData: a
copy of Name with diacritics stripped via Unicode normalization. If the
software reading the address book allows extra information for an
entry, this field can be used to search for both the ASCII and the
proper Unicode version of a name.
---
 README.md  | 13 +++++++++++--
 data.go    | 19 ++++++++++---------
 ranking.go | 11 +++++++++++
 3 files changed, 32 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index cb6880d..87e3b35 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,7 @@ available extremely fast.
 - ranks addresses explicitly emailed by you higher
 - configurable output via go templates
 - uses the most frequent non-empty display name for each email
+- display name is also available with diacritics stripped (Unicode normalized) for search purposes
 - filters common "no reply" addresses, additional filters can be added via regexes
 - normalizes emails to lower case
 - ability to add additional email addresses from a command
@@ -76,6 +77,7 @@ Available keys:
 ```
 	Address
 	Name
+	NormalizedName: same as Name, but with diacritics removed (e.g. "Árpád" becomes "Arpad")
 	Names
 	Class
 	FrequencyRank
@@ -139,7 +141,7 @@ addresses = [
 ]
 filters = ["@spam.(com|org)"]
 outputpath = "~/.mail/addressbook"
-template = "{{.Address}}\t{{.Name}}"
+template = "{{.Address}}\t{{.Name}}\t{{.NormalizedName}}"
 ```
 
 ## Integration
@@ -152,11 +154,18 @@ Put something like this in your aerc config (using your favourite grep):
 address-book-cmd="ugrep -jP -m 100 --color=never %s /home/[myuser]/.cache/maildir-rank-addr/addressbook.tsv"
 ```
 
-(`-j` is smart case insensitive, and needs to be combined with `-P` for UTF-8)
+(`-j` is smart case insensitive, and needs to be combined with `-P` for UTF-8).
+
+Since aerc only uses the first two of the tab-separated columns, any other
+column can be added to help with search or to combine with external tools. For
+example, adding `NormalizedName` as the third column lets you type "arpad" and
+still find and use the entries for both "Árpád X", who properly uses accents in
+his name, and "Arpad Y", who conformed to ASCII for some reason.
 
 Note that `address-book-cmd` is not executed in the shell, so you need to hard
 code the path without shell expansion.
 
+
 # Behind the scenes
 
 ## Ranking
diff --git a/data.go b/data.go
index 3e01c3a..0cb43ab 100644
--- a/data.go
+++ b/data.go
@@ -7,15 +7,16 @@ import (
 )
 
 type AddressData struct {
-	Address       string
-	Names         []string
-	Class         int
-	FrequencyRank int
-	RecencyRank   int
-	TotalRank     int
-	ClassCount    [3]int
-	ClassDate     [3]int64
-	Name          string
+	Address        string
+	Names          []string
+	Class          int
+	FrequencyRank  int
+	RecencyRank    int
+	TotalRank      int
+	ClassCount     [3]int
+	ClassDate      [3]int64
+	Name           string
+	NormalizedName string // Name with nonspacing marks (diacritics) stripped; set by normalizeAddressNames
 }
 
 type Config struct {
diff --git a/ranking.go b/ranking.go
index b8b7022..2470456 100644
--- a/ranking.go
+++ b/ranking.go
@@ -3,6 +3,10 @@ package main
 import (
 	"sort"
 	"strings"
+	"unicode"
+
+	"golang.org/x/text/transform"
+	"golang.org/x/text/unicode/norm"
 )
 
 func getMostFrequent(names []string) string {
@@ -31,11 +35,18 @@ func getMostFrequent(names []string) string {
 	return lastname
 }
 
+func isMn(r rune) bool {
+	return unicode.Is(unicode.Mn, r) // Mn: Unicode category "Mark, nonspacing" (combining accents and similar marks)
+}
+
 func normalizeAddressNames(aD AddressData) AddressData {
 	if aD.Name != "" {
 		return aD
 	}
 	aD.Name = getMostFrequent(aD.Names)
+	t := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC) // decompose, drop nonspacing marks, then recompose
+	normStr, _, _ := transform.String(t, aD.Name) // NOTE(review): byte count and error from transform.String are discarded
+	aD.NormalizedName = normStr // NOTE(review): stays empty when Name was already set (early return above) - confirm intended
 	return aD
 }