-
Notifications
You must be signed in to change notification settings - Fork 62
[WIP] additionnal lib/cleanup for French language to improve quality of inputs #635
Changes from all commits
8128068
db15ee1
9e0d82f
58430bb
13cbb55
2abf67c
12fcb1e
4a2ea4e
aef4284
18fdf37
1a0cff5
8d7d973
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,8 @@ function sortSentences(sentences) { | |
function clean(sentences) { | ||
return sentences.map((sentence) => { | ||
return sentence | ||
//caracters and space cleanup | ||
|
||
// no space after opening '(' or '[' | ||
.replace(/\(\s+/g, '(') | ||
.replace(/\[\s+/g, '[') | ||
|
@@ -18,6 +20,8 @@ function clean(sentences) { | |
.replace(/\s+\)/g, ')') | ||
.replace(/\s+\]/g, ']') | ||
|
||
// normalize – (long hyphen) into - (short hyphen) | ||
.replace(/–/g, '-') | ||
// no space before or after hyphen | ||
.replace(/\s+-\s+/g, '-') | ||
|
||
|
@@ -31,15 +35,18 @@ function clean(sentences) { | |
// Normalize three consecutive dots into unicode elipsis | ||
.replace(/\.{3}/g, '…') | ||
|
||
// In fr-FR, those should have a no space before | ||
.replace(/\s+,/g, ',') | ||
// Normalize ´ or ’ (french apostroph) into ' (usual apostroph) | ||
.replace(/\´|’/g, '\'') | ||
|
||
// In fr-FR, those should have a no space before and a normal space after | ||
.replace(/\s+,/g, ',') // before ... | ||
.replace(/\s+\./g, '.') | ||
.replace(/\s+…/g, '…') | ||
.replace(/,(?!\s+)/g, ', ') | ||
.replace(/,(?!\s+)/g, ', ') // after ... | ||
.replace(/\.(?!\s+)/g, '. ') | ||
.replace(/…(?!\s+)/g, '… ') | ||
|
||
// In fr-FR, those should have a non-breakable space before and after | ||
// In fr-FR, those should have a non-breakable space before and a normal space after | ||
.replace(/([^ ]|^):/g, '$1 :') // before ... | ||
.replace(/([^ ]|^);/g, '$1 ;') | ||
.replace(/([^ ]|^)\?/g, '$1 ?') | ||
|
@@ -49,9 +56,128 @@ function clean(sentences) { | |
.replace(/\?(?!\s+)/g, '? ') | ||
.replace(/!(?!\s+)/g, '! ') | ||
|
||
//abrevation fr-FR cleanup | ||
//based on common-voice/CorporaCreator#87 | ||
.replace(/(^|\s|\w)\/an(\s|\.|,|\?|!|$)/g, '$1 par an ') | ||
.replace(/(^|\s)km(\s|\.|,|\?|!|$)/g, ' kilomètres ') | ||
.replace(/%/, ' pourcent ') | ||
.replace(/(^|\s|\w)\+(\s|\.|,|\?|!|$)/g, ' plus ') | ||
.replace(/(^|\s|[0-9]+)m(?:2|²)(\s|\.|,|\?|!|$)/g, '$1 mètres carrés ') | ||
.replace(/(^|\s|[0-9]+)(\/|\/\s)m(?:2|²)(\s|\.|,|\?|!|$)/g, '$1 par mètres carrés ') | ||
.replace(/\s?€/g, ' euros ') | ||
.replace(/\s?£/g, ' livres ') | ||
.replace(/\s?$/g, ' dollars ') | ||
.replace(/(^| )(n|N)(?:°|º|°)(\s)?/g, ' $2uméro ') //n° or N° => 'numéro' or 'Numéro' | ||
|
||
|
||
//roman numerals + century | ||
.replace(/(^|\s)Ie(r)? s.(\s|\.|,|\?|!|$)/g, ' premier siècle ') | ||
.replace(/(^|\s)II(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' deuxième siècle ') | ||
.replace(/(^|\s)III(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' troisième siècle ') | ||
.replace(/(^|\s)IV(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' quatrième siècle ') | ||
.replace(/(^|\s)V(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' cinquième siècle ') | ||
.replace(/(^|\s)VI(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' sixième siècle ') | ||
.replace(/(^|\s)VII(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' septième siècle ') | ||
.replace(/(^|\s)VIII(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' huitième siècle ') | ||
.replace(/(^|\s)(VIIII|IX)(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' neuvième siècle ') | ||
.replace(/(^|\s)X(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' dixième siècle ') | ||
.replace(/(^|\s)XI(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' onzième siècle ') | ||
.replace(/(^|\s)XII(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' douxième siècle ') | ||
.replace(/(^|\s)XIII(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' treizième siècle ') | ||
.replace(/(^|\s)(XIIII|XIV)(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' quatorzième siècle ') | ||
.replace(/(^|\s)XV(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' quinzième siècle ') | ||
.replace(/(^|\s)XVI(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' seixième siècle ') | ||
.replace(/(^|\s)XVII(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' dix-septième siècle ') | ||
.replace(/(^|\s)XVIII(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' dix-huitième siècle ') | ||
.replace(/(^|\s)(XIX|XVIIII)(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' dix_neuvième siècle ') | ||
.replace(/(^|\s)XX(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' vingtième siècle ') | ||
.replace(/(^|\s)XXI(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' vingt-et-unième siècle ') | ||
.replace(/(^|\s)XXII(e|è)(me)? s.(\s|\.|,|\?|!|$)/g, ' vingt-deuxième siècle ') | ||
|
||
//roman numerals. | ||
.replace(/(^|\s)I(\s|\.|,|\?|!|$)/g, ' premier ') //translated as 'first'. We considere that it's encountered after chapter (Chapitre I => 'chapitre premier'). Work also with names (Charles I => 'Charles premier') | ||
.replace(/(^|\s)II(\s|\.|,|\?|!|$)/g, ' deux ') | ||
.replace(/(^|\s)III(\s|\.|,|\?|!|$)/g, ' trois ') | ||
.replace(/(^|\s)IV(\s|\.|,|\?|!|$)/g, ' quatre ') | ||
.replace(/(^|\s)V(\s|\.|,|\?|!|$)/g, ' cinq ') | ||
.replace(/(^|\s)VI(\s|\.|,|\?|!|$)/g, ' six ') | ||
.replace(/(^|\s)VII(\s|\.|,|\?|!|$)/g, ' sept ') | ||
.replace(/(^|\s)VIII(\s|\.|,|\?|!|$)/g, ' huit ') | ||
.replace(/(^|\s)(VIIII|IX)(\s|\.|,|\?|!|$)/g, ' neuf ') | ||
.replace(/(^|\s)X(\s|\.|,|\?|!|$)/g, ' dix ') | ||
.replace(/(^|\s)XI(\s|\.|,|\?|!|$)/g, ' onze ') | ||
.replace(/(^|\s)XII(\s|\.|,|\?|!|$)/g, ' douze ') | ||
.replace(/(^|\s)XIII(\s|\.|,|\?|!|$)/g, ' treize ') | ||
.replace(/(^|\s)(XIIII|XIV)(\s|\.|,|\?|!|$)/g, ' quatorze ') | ||
.replace(/(^|\s)XV(\s|\.|,|\?|!|$)/g, ' quinze ') | ||
.replace(/(^|\s)XVI(\s|\.|,|\?|!|$)/g, ' seize ') | ||
.replace(/(^|\s)XVII(\s|\.|,|\?|!|$)/g, ' dix-sept ') | ||
.replace(/(^|\s)XVIII(\s|\.|,|\?|!|$)/g, ' dix-huit ') | ||
.replace(/(^|\s)(XIX|XVIIII)(\s|\.|,|\?|!|$)/g, ' dix-neuf ') | ||
.replace(/(^|\s)XX(\s|\.|,|\?|!|$)/g, ' vingt ') | ||
.replace(/(^|\s)XXI(\s|\.|,|\?|!|$)/g, ' vingt-et-un ') | ||
.replace(/(^|\s)XXII(\s|\.|,|\?|!|$)/g, ' vingt-deux ') | ||
|
||
//first, second, etc. | ||
.replace(/(^|\s)1er?s?(\s|\.|,|\?|!|$)/g, ' premier ') | ||
.replace(/(^|\s)1(e|è)res?(\s|\.|,|\?|!|$)/g, ' premier ') | ||
.replace(/(^|\s)2(e|è)?me?s?(\s|\.|,|\?|!|$)/g, ' deuxième ') | ||
.replace(/(^|\s)2n?ds?(\s|\.|,|\?|!|$)/g, ' second ') | ||
.replace(/(^|\s)2n?des?(\s|\.|,|\?|!|$)/g, ' seconde ') | ||
.replace(/(^|\s)3i?(e|è)me?s?(\s|\.|,|\?|!|$)/g, ' troisième ') | ||
.replace(/(^|\s)4i?(e|è)me?s?(\s|\.|,|\?|!|$)/g, ' quatrième ') | ||
.replace(/(^|\s)5i?(e|è)me?s?(\s|\.|,|\?|!|$)/g, ' cinquième ') | ||
.replace(/(^|\s)6i?(e|è)me?s?(\s|\.|,|\?|!|$)/g, ' sixième ') | ||
.replace(/(^|\s)7i?(e|è)me?s?(\s|\.|,|\?|!|$)/g, ' septième ') | ||
.replace(/(^|\s)8i?(e|è)me?s?(\s|\.|,|\?|!|$)/g, ' huitième ') | ||
.replace(/(^|\s)9i?(e|è)me?s?(\s|\.|,|\?|!|$)/g, ' neuvième ') | ||
.replace(/(^|\s)10i?(e|è)me?s?(\s|\.|,|\?|!|$)/g, ' dixième ') | ||
|
||
|
||
//acronym fr-FR cleanup | ||
//based on common-voice/CorporaCreator#87 | ||
.replace(/(^|\s)ANPE(\s|\.|,|\?|!|$)/g, ' Agence Nationale Pour l\'Emploi $3 ') | ||
.replace(/(^|\s)APL(\s|\.|,|\?|!|$)/g, ' Aide personnalisée au logement $3 ') | ||
.replace(/(^|\s)CDI(\s|\.|,|\?|!|$)/g, ' Contrat à Durée Indéterminée $3 ') | ||
.replace(/(^|\s)CICE(\s|\.|,|\?|!|$)/g, ' Crédit d\'impôt pour la compétitivité et l\'emploi $3 ') | ||
.replace(/(^|\s)DRH(\s|\.|,|\?|!|$)/g, ' Direction des Ressources Humaines $3 ') | ||
.replace(/(^|\s)EDF(\s|\.|,|\?|!|$)/g, ' Electricité de France $3 ') | ||
.replace(/(^|\s)FN(\s|\.|,|\?|!|$)/g, ' Front National $3 ') | ||
.replace(/(^|\s)HLM(\s|\.|,|\?|!|$)/g, ' Habitation à Loyer Modéré $3 ') | ||
.replace(/(^|\s)IGN(\s|\.|,|\?|!|$)/g, ' Institut Géographique National $3 ') | ||
.replace(/(^|\s)INPI(\s|\.|,|\?|!|$)/g, ' Institut National de la Propriété Intellectuelle $3 ') | ||
.replace(/(^|\s)ISF(\s|\.|,|\?|!|$)/g, ' Impôt sur la fortune $3 ') | ||
.replace(/(^|\s)IUT(\s|\.|,|\?|!|$)/g, ' Institut Universitaire de Technologie $3 ') | ||
.replace(/(^|\s)LREM(\s|\.|,|\?|!|$)/g, ' La Réplublique En Marche $3 ') | ||
.replace(/(^|\s)NUPES(\s|\.|,|\?|!|$)/g, ' Nupès $3 ') | ||
.replace(/(^|\s)PHP(\s|\.|,|\?|!|$)/g, ' Protocole Hypertexte Protocolaire $3 ') | ||
.replace(/(^|\s)PMA(\s|\.|,|\?|!|$)/g, ' Procréation médicalement assistée $3 ') | ||
.replace(/(^|\s)PME(\s|\.|,|\?|!|$)/g, ' Petite et Moyenne Entreprise $3 ') | ||
.replace(/(^|\s)RN(\s|\.|,|\?|!|$)/g, ' Rassemblement National $3 ') | ||
.replace(/(^|\s)RSA(\s|\.|,|\?|!|$)/g, ' Revenu de Solidarité Active $3 ') | ||
.replace(/(^|\s)RSA(\s|\.|,|\?|!|$)/g, ' Revenu de Solidarité Active $3 ') | ||
.replace(/(^|\s)RSI(\s|\.|,|\?|!|$)/g, ' Régime Social des Indépendants $3 ') | ||
.replace(/(^|\s)RTE(\s|\.|,|\?|!|$)/g, ' Réseau de Transport d\'Électricité $3 ') | ||
.replace(/(^|\s)SNCF(\s|\.|,|\?|!|$)/g, ' Société Nationale des Chemins de Fer $3 ') | ||
.replace(/(^|\s)TGV(\s|\.|,|\?|!|$)/g, ' Train à Grande Vitesse $3 ') | ||
.replace(/(^|\s)TVA(\s|\.|,|\?|!|$)/g, ' Taxe sur la Valeur Ajoutée $3 ') | ||
.replace(/(^|\s)UDI(\s|\.|,|\?|!|$)/g, ' Union des Démocrates Indépendants $3 ') | ||
.replace(/(^|\s)UMP(\s|\.|,|\?|!|$)/g, ' Union pour un Mouvement Populaire $3 ') | ||
.replace(/(^|\s)USA(\s|\.|,|\?|!|$)/g, ' Etats Unis d\'Amérique $3 ') | ||
|
||
//replace fraction 1/2 => '1 sur 2' | ||
¨.replace(/(^| )(\d+)(\s)?(\/)(\s)?(\d+)(\s|\.|,|\?|!|$)/g, '$2 sur $6') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Numbers are not allowed by the validation file. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't we keep it for "future-proof rules" and/or "bulk load rules if you don't have yours" ? |
||
|
||
//dates, digits and numbers fr-FR cleanup | ||
//todo : CONVERT TO TEXT instead of removing it | ||
.replace((^|\s)\d{1,2}\/\d{1,2}\/(\d{2}[^\d]|\d{4})(\s|$), ' ') //date format dd/mm/yy ou dd/mm/yyyy | ||
.replace((^|\s)\d{1,2}\/(\d{2}[^\d]|\d{4})(\s|$), ' ') //date format mm/yy ou mm/yyyy | ||
.replace(\d, '') //any digit ou number left | ||
|
||
// Final normalization of spaces | ||
.replace(/\s+/g, ' ') | ||
.replace(/\s+$/g, '') | ||
|
||
; | ||
}); | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
//see : https://github.com/common-voice/sentence-collector/blob/main/server/lib/validation/VALIDATION.md for more information | ||
|
||
const tokenizeWords = require('talisman/tokenizers/words'); | ||
|
||
// Minimum of words that qualify as a sentence. | ||
const MIN_WORDS = 1; | ||
|
||
// Maximum of words allowed per sentence to keep recordings in a manageable duration. | ||
const MAX_WORDS = 14; | ||
|
||
// Validation rules for fr sentences. Each entry provides either `fn`
// (a predicate called with the sentence) or `regex`, together with the French
// `error` message attached to that rule.
// NOTE(review): presumably a sentence is rejected when `fn` returns true or
// `regex` matches — confirm against the validation runner that consumes this.
const INVALIDATIONS = [{
  // [min..max] words: true when the tokenized word count falls outside
  // MIN_WORDS..MAX_WORDS (both bounds inclusive).
  fn: (sentence) => {
    const words = tokenizeWords(sentence);
    return words.length < MIN_WORDS || words.length > MAX_WORDS;
  },
  error: `Le nombre de mots doit être entre ${MIN_WORDS} et ${MAX_WORDS} (inclus)`,
}, {
  // No numbers: matches any run of ASCII digits.
  regex: /[0-9]+/,
  error: 'Les phrases ne doivent pas contenir de nombres',
}, {
  // No symbols: matches any one of < > + * # @ % ^ [ ] ( ) /
  regex: /[<>+*#@%^[\]()/]/,
  // Message fixed: "de doivent" -> "ne doivent", matching the other rules.
  error: 'Les phrases ne doivent pas contenir de symboles (*, #, (, etc.)',
}, {
  // Any words consisting of uppercase letters or uppercase letters with a period
  // in between are considered abbreviations or acronyms.
  // This currently also matches fooBAR but we most probably don't want that either
  // as users wouldn't know how to pronounce the uppercase letters.
  regex: /[A-Z]{2,}|[A-Z]+\.*[A-Z]+/,
  error: 'Les phrases ne doivent pas contenir des abréviations ou sigles',
}];
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Aren't we missing something about the french apostrophe (´)? I don't know how the model would benefit (or not) from this situation. I'm pretty sure text processing would do the uniformisation (before TTS for example) so I believe we should make it too. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. commit 1a0cff5 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. BTW, sources files like https://github.com/common-voice/commonvoice-fr/blob/master/CommonVoice-Data/data/debats-assemblee-nationale/20130718093000000.txt are using french apostrophe ?!
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. so... commit 8d7d973 |
||
|
||
module.exports = { | ||
INVALIDATIONS, | ||
}; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure about this one. This could very well be a date.
Il est né le 25/12
qui ne devrait pas être remplacé de la sorte et prend priorité sur la regexp suivante (date format mm/yy)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oh shoot ! you're right !
To solve it, we shall test numbers, and see "if it's in date [1..31]/[1..12] (dd/mm) or [1..12]/[00..99] or [1..12]/[1700..2030] (mm/yy(yy)?) range, it shall be a date stamp, otherwise it's a fraction ?!"
Wow! It escalated quickly!
Sometimes, only context will show us that "mettre 1/4 de litre de lait" will go to "mettre un avril de litre de lait"... But it shall be more readable than "mettre de litre de lait".
On the other hand, if we handle dates poorly (not checking mm/yy but only mm/yyyy), it will go like: "il est né en 12/88." to "il est né en douze sur quatre-vingt huit." instead of "il est né en décembre quatre-vingt huit."... Again, it's more readable than "il est né en."
IMHO, [1..31]/[1..12] (dd/mm) or [1..12]/[1700..2030] (mm/yyyy) range shall work fine. The rest will go through 'fraction replacement', and will always be better than 'full removal'.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's first wait for issue #636 resolution, 'coz I don't want to work for nothing :-/