output: add NormalizedName

mra uses the most frequent display name for each correspondent. People with non-ASCII names sometimes write their name with the non-ASCII characters and sometimes convert it to ASCII, so it is hard to know exactly what to search for. This commit adds NormalizedName to AddressData: the display name after Unicode normalization with combining accents removed. If the software reading the address book allows extra information for an entry, this field can be used to find a name by either its ASCII or its proper Unicode spelling.
ferdinandyb · May 28, 2023 · 50569b5 · 50569b5
1 parent 5c839fb
commit 50569b5
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -15,6 +15,7 @@ available extremely fast.
 - ranks addresses explicitly emailed by you higher
 - configurable output via go templates
 - uses the most frequent non-empty display name for each email
+- display name can be unicode normalized for search purposes
 - filters common "no reply" addresses, additional filters can be added via regexes
 - normalizes emails to lower case
 - ability to add additional email addresses from a command
@@ -76,6 +77,7 @@ Available keys:
 ```
 	Address
 	Name
+	NormalizedName: same as Name, but Unicode-normalized with accents (nonspacing marks) removed
 	Names
 	Class
 	FrequencyRank
@@ -139,7 +141,7 @@ addresses = [
 ]
 filters = ["@spam.(com|org)"]
 outputpath = "~/.mail/addressbook"
-template = "{{.Address}}\t{{.Name}}"
+template = "{{.Address}}\t{{.Name}}\t{{.NormalizedName}}"
 ```
 
 ## Integration
@@ -152,11 +154,18 @@ Put something like this in your aerc config (using your favourite grep):
 address-book-cmd="ugrep -jP -m 100 --color=never %s /home/[myuser]/.cache/maildir-rank-addr/addressbook.tsv"
 ```
 
-(`-j` is smart case insensitive, and needs to be combined with `-P` for UTF-8)
+(`-j` is smart case insensitive, and needs to be combined with `-P` for UTF-8).
+
+Since aerc only uses the first two of the tab-separated columns, any other
+column can be added to help with search or to combine with external tools. For
+example, adding `NormalizedName` as the third column will allow you to type
+"arpad" and still find and use the entries for both "Árpád X", who uses accents
+in his name properly, and "Arpad Y", who conformed to ASCII for some reason.
 
 Note that `address-book-cmd` is not executed in the shell, so you need to hard
 code the path without shell expansion.
 
+
 # Behind the scenes
 
 ## Ranking

diff --git a/data.go b/data.go
@@ -7,15 +7,16 @@ import (
 )
 
 type AddressData struct { // per-address record; the README lists these fields as keys available to the output template
-	Address       string
-	Names         []string
-	Class         int
-	FrequencyRank int
-	RecencyRank   int
-	TotalRank     int
-	ClassCount    [3]int
-	ClassDate     [3]int64
-	Name          string
+	Address        string
+	Names          []string
+	Class          int
+	FrequencyRank  int
+	RecencyRank    int
+	TotalRank      int
+	ClassCount     [3]int
+	ClassDate      [3]int64
+	Name           string // display name; normalizeAddressNames fills it from Names when it is empty
+	NormalizedName string // Name after NFD decomposition, nonspacing-mark removal and NFC recomposition (set in normalizeAddressNames)
 }
 
 type Config struct {

diff --git a/ranking.go b/ranking.go
@@ -3,6 +3,10 @@ package main
 import (
 	"sort"
 	"strings"
+	"unicode"
+
+	"golang.org/x/text/transform"
+	"golang.org/x/text/unicode/norm"
 )
 
 func getMostFrequent(names []string) string {
@@ -31,11 +35,18 @@ func getMostFrequent(names []string) string {
 	return lastname
 }
 
+func isMn(r rune) bool { // reports whether r is a nonspacing mark; used as the removal predicate in normalizeAddressNames
+	return unicode.Is(unicode.Mn, r) // Mn: Unicode category "Mark, nonspacing" (e.g. combining accents)
+}
+
 func normalizeAddressNames(aD AddressData) AddressData { // fills Name (and NormalizedName) on a copy of aD and returns it
 	if aD.Name != "" { // a Name is already present: leave the record untouched
 		return aD // NOTE(review): NormalizedName is not computed on this path — confirm that is intended
 	}
 	aD.Name = getMostFrequent(aD.Names) // choose the display name from the collected Names (most frequent one, per the helper's name)
+	t := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC) // decompose, drop nonspacing marks, recompose; NOTE(review): x/text marks transform.RemoveFunc deprecated in favour of runes.Remove
+	normStr, _, _ := transform.String(t, aD.Name) // the byte count and the error result are discarded here
+	aD.NormalizedName = normStr // accent-stripped form of Name, intended for searching
 	return aD
 }