From 50569b528c5bbfbf09154b7e17cb0f4ba723d1f0 Mon Sep 17 00:00:00 2001 From: Bence Ferdinandy Date: Sun, 28 May 2023 21:05:55 +0200 Subject: [PATCH] output: add NormalizedName Since mra uses the most frequent name used by each correspondent, and people with non-ASCII names sometimes use the non-ASCII version of their names when emailing and sometimes convert them to ASCII, it's hard to know what exactly to search for. Add NormalizedName to AddressData: a copy of Name that is Unicode normalized with nonspacing marks (accents) removed. If the software reading the address book allows for extra information for an entry, this field can be used to search for both the ASCII and the proper Unicode version of a name. --- README.md | 13 +++++++++++-- data.go | 19 ++++++++++--------- ranking.go | 11 +++++++++++ 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index cb6880d..87e3b35 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ available extremely fast. - ranks addresses explicitly emailed by you higher - configurable output via go templates - uses the most frequent non-empty display name for each email +- display name can be unicode normalized for search purposes - filters common "no reply" addresses, additional filters can be added via regexes - normalizes emails to lower case - ability to add additional email addresses from a command @@ -76,6 +77,7 @@ Available keys: ``` Address Name + NormalizedName: same as Name, but unicode normalized Names Class FrequencyRank @@ -139,7 +141,7 @@ addresses = [ ] filters = ["@spam.(com|org)"] outputpath = "~/.mail/addressbook" -template = "{{.Address}}\t{{.Name}}" +template = "{{.Address}}\t{{.Name}}\t{{.NormalizedName}}" ``` ## Integration @@ -152,11 +154,18 @@ Put something like this in your aerc config (using your favourite grep): address-book-cmd="ugrep -jP -m 100 --color=never %s /home/[myuser]/.cache/maildir-rank-addr/addressbook.tsv" ``` -(`-j` is smart case insensitive, and needs to be combined with `-P` for UTF-8) +(`-j` is
smart case insensitive, and needs to be combined with `-P` for UTF-8). + +Since aerc only uses the first two of the tab separated columns any other +column can be added to help with search or to combine with external tools. For +example adding `NormalizedName` as the third column will allow you to type +"arpad", and still find and use the entry for "Árpád X" who uses accents in his +name properly, and "Arpad Y" who conformed to ASCII for some reason. Note that `address-book-cmd` is not executed in the shell, so you need to hard code the path without shell expansion. + # Behind the scenes ## Ranking diff --git a/data.go b/data.go index 3e01c3a..0cb43ab 100644 --- a/data.go +++ b/data.go @@ -7,15 +7,16 @@ import ( ) type AddressData struct { - Address string - Names []string - Class int - FrequencyRank int - RecencyRank int - TotalRank int - ClassCount [3]int - ClassDate [3]int64 - Name string + Address string + Names []string + Class int + FrequencyRank int + RecencyRank int + TotalRank int + ClassCount [3]int + ClassDate [3]int64 + Name string + NormalizedName string } type Config struct { diff --git a/ranking.go b/ranking.go index b8b7022..2470456 100644 --- a/ranking.go +++ b/ranking.go @@ -3,6 +3,10 @@ package main import ( "sort" "strings" + "unicode" + + "golang.org/x/text/transform" + "golang.org/x/text/unicode/norm" ) func getMostFrequent(names []string) string { @@ -31,11 +35,18 @@ func getMostFrequent(names []string) string { return lastname } +func isMn(r rune) bool { + return unicode.Is(unicode.Mn, r) // Mn: nonspacing marks +} + func normalizeAddressNames(aD AddressData) AddressData { if aD.Name != "" { return aD } aD.Name = getMostFrequent(aD.Names) + t := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC) + normStr, _, _ := transform.String(t, aD.Name) + aD.NormalizedName = normStr return aD }