From e0fdca0de68add2a7af227986a058e5140364cef Mon Sep 17 00:00:00 2001 From: John T Maxwell III Date: Wed, 15 May 2024 12:25:23 -0700 Subject: [PATCH] Change parser to parse lowercase forms of uppercase words separately (#54) * Also fix undesired behavior in Sandbox and update liblcm --------- Co-authored-by: Jason Naylor --- Build/mkall.targets | 2 +- Build/nuget-common/packages.config | 18 ++++++------- .../Interlinear/InterlinDocRootSiteBase.cs | 20 +++++++++++++- Src/LexText/Interlinear/SandboxBase.cs | 25 +++++++++++++++++- .../ParserCoreTests/ParseWorkerTests.cs | 11 +++++--- Src/LexText/ParserCore/ParserWorker.cs | 26 ++++++++++++------- Src/xWorks/RecordList.cs | 3 ++- 7 files changed, 78 insertions(+), 27 deletions(-) diff --git a/Build/mkall.targets b/Build/mkall.targets index 298ba3c581..fe4a0ea566 100644 --- a/Build/mkall.targets +++ b/Build/mkall.targets @@ -285,7 +285,7 @@ 5.2.0-beta0003 13.0.0-beta0076 9.4.0.1-beta - 11.0.0-beta0089 + 11.0.0-beta0090 70.1.123 2.5.13 diff --git a/Build/nuget-common/packages.config b/Build/nuget-common/packages.config index e0e5d9a534..ac1ec0bc4b 100644 --- a/Build/nuget-common/packages.config +++ b/Build/nuget-common/packages.config @@ -52,15 +52,15 @@ - - - - - - - - - + + + + + + + + + diff --git a/Src/LexText/Interlinear/InterlinDocRootSiteBase.cs b/Src/LexText/Interlinear/InterlinDocRootSiteBase.cs index 2f749a8314..7649ebdab3 100644 --- a/Src/LexText/Interlinear/InterlinDocRootSiteBase.cs +++ b/Src/LexText/Interlinear/InterlinDocRootSiteBase.cs @@ -18,6 +18,7 @@ using SIL.LCModel.Infrastructure; using SIL.FieldWorks.FwCoreDlgControls; using XCore; +using SIL.LCModel.Core.Text; namespace SIL.FieldWorks.IText { @@ -1046,11 +1047,28 @@ public virtual void PropChanged(int hvo, int tag, int ivMin, int cvIns, int cvDe break; case WfiWordformTags.kflidAnalyses: IWfiWordform wordform = m_cache.ServiceLocator.GetInstance().GetObject(hvo); - if (RootStText.UniqueWordforms().Contains(wordform)) + var uniqueWordforms = RootStText.UniqueWordforms(); + if (uniqueWordforms.Contains(wordform)) { m_wordformsToUpdate.Add(wordform); m_mediator.IdleQueue.Add(IdleQueuePriority.High, PostponedUpdateWordforms); } + // Update uppercase versions of wordform. + // (When a lowercase wordform changes, it affects the best guess of its uppercase versions.) + var form = wordform.Form.VernacularDefaultWritingSystem; + var cf = new CaseFunctions(m_cache.ServiceLocator.WritingSystemManager.Get(form.get_WritingSystemAt(0))); + foreach (IWfiWordform ucWordform in uniqueWordforms) + { + var ucForm = ucWordform.Form.VernacularDefaultWritingSystem; + if (ucForm != form && ucForm != null && !string.IsNullOrEmpty(ucForm.Text)) + { + if (cf.ToLower(ucForm.Text) == form.Text) + { + m_wordformsToUpdate.Add(ucWordform); + m_mediator.IdleQueue.Add(IdleQueuePriority.High, PostponedUpdateWordforms); + } + } + } break; } } diff --git a/Src/LexText/Interlinear/SandboxBase.cs b/Src/LexText/Interlinear/SandboxBase.cs index 7e4f10cf53..a19c272b9e 100644 --- a/Src/LexText/Interlinear/SandboxBase.cs +++ b/Src/LexText/Interlinear/SandboxBase.cs @@ -1662,8 +1662,31 @@ private void GetDefaults(IWfiWordform wordform, ref IWfiAnalysis analysis, out I // to prevent using data that does not exist anymore if(!Cache.ServiceLocator.IsValidObjectId(hvoDefault)) hvoDefault = 0; + if (hvoDefault != 0 && m_fSetWordformInProgress) + { + // Verify that the guess includes the wordform set by the user. + // (The guesser may have guessed a lowercase wordform for an uppercase occurrence.) + // If it doesn't include the wordform, set hvoDefault to 0. + var obj = m_caches.MainCache.ServiceLocator.GetObject(hvoDefault); + IWfiWordform guessWf = null; + switch (obj.ClassID) + { + case WfiAnalysisTags.kClassId: + guessWf = ((IWfiAnalysis)obj).Wordform; + break; + case WfiGlossTags.kClassId: + guessWf = ((IWfiGloss)obj).Wordform; + break; + case WfiWordformTags.kClassId: + guessWf = (IWfiWordform)obj; + break; + } + if (guessWf != null && guessWf != wordform) + hvoDefault = 0; + } + } - else + if (hvoDefault == 0) { // Try to establish a default based on the wordform itself. int ws = wordform.Cache.DefaultVernWs; diff --git a/Src/LexText/ParserCore/ParserCoreTests/ParseWorkerTests.cs b/Src/LexText/ParserCore/ParserCoreTests/ParseWorkerTests.cs index 46c86ce6a4..37383fb515 100644 --- a/Src/LexText/ParserCore/ParserCoreTests/ParseWorkerTests.cs +++ b/Src/LexText/ParserCore/ParserCoreTests/ParseWorkerTests.cs @@ -149,17 +149,20 @@ public void UpdateWordform() parserWorker.Parser = new TestParserClass(lowerResult, null); // SUT + // Parsing an uppercase wordform should cause the lowercase wordform to be parsed. + // The uppercase wordform doesn't get a parse. var bVal = parserWorker.UpdateWordform(catsUpperTest, ParserPriority.Low); ExecuteIdleQueue(); Assert.IsTrue(bVal); - CheckAnalysisSize("Cats", 1, false); - CheckAnalysisSize("cats", 0, false); + CheckAnalysisSize("Cats", 0, false); + CheckAnalysisSize("cats", 1, false); // SUT + // The lowercase wordform has already been parsed. bVal = parserWorker.UpdateWordform(catsLowerTest, ParserPriority.Low); ExecuteIdleQueue(); - Assert.IsTrue(bVal); - CheckAnalysisSize("Cats", 1, false); + Assert.IsFalse(bVal); + CheckAnalysisSize("Cats", 0, false); CheckAnalysisSize("cats", 1, false); } #endregion // Tests diff --git a/Src/LexText/ParserCore/ParserWorker.cs b/Src/LexText/ParserCore/ParserWorker.cs index 7015212615..e2eb769b62 100644 --- a/Src/LexText/ParserCore/ParserWorker.cs +++ b/Src/LexText/ParserCore/ParserWorker.cs @@ -32,6 +32,7 @@ using SIL.LCModel.Infrastructure; using SIL.ObjectModel; using XCore; +using SIL.LCModel.DomainServices; namespace SIL.FieldWorks.WordWorks.Parser { @@ -164,18 +165,23 @@ public bool UpdateWordform(IWfiWordform wordform, ParserPriority priority) CustomIcu.GetIcuNormalizer(FwNormalizationMode.knmNFD) .Normalize(form.Text.Replace(' ', '.'))); - // If the parse of the original word was not successful,then try to parse the lowercase word. - if (result.Analyses.Count == 0 || result.ErrorMessage != null) - { - var cf = new CaseFunctions(m_cache.ServiceLocator.WritingSystemManager.Get(form.get_WritingSystemAt(0))); - string sLower = cf.ToLower(form.Text); + // Try parsing the lowercase word if it is different from the original word. + // Do this even if the uppercase word parsed successfully. + var cf = new CaseFunctions(m_cache.ServiceLocator.WritingSystemManager.Get(form.get_WritingSystemAt(0))); + string sLower = cf.ToLower(form.Text); - // Try parsing the lowercase word if it is different from the original word. - if (sLower != form.Text) + if (sLower != form.Text) + { + var lcResult = m_parser.ParseWord( + CustomIcu.GetIcuNormalizer(FwNormalizationMode.knmNFD) + .Normalize(sLower.Replace(' ', '.'))); + if (lcResult.Analyses.Count > 0 && lcResult.ErrorMessage == null) { - result = m_parser.ParseWord( - CustomIcu.GetIcuNormalizer(FwNormalizationMode.knmNFD) - .Normalize(sLower.Replace(' ', '.'))); + var text = TsStringUtils.MakeString(sLower, form.get_WritingSystem(0)); + var lcWordform = WfiWordformServices.FindOrCreateWordform(m_cache, text); + m_parseFiler.ProcessParse(lcWordform, priority, lcResult); + m_parseFiler.ProcessParse(wordform, priority, result); + return true; } } diff --git a/Src/xWorks/RecordList.cs b/Src/xWorks/RecordList.cs index d36307882b..7a5ed518c5 100644 --- a/Src/xWorks/RecordList.cs +++ b/Src/xWorks/RecordList.cs @@ -1743,7 +1743,8 @@ protected virtual bool TryHandleUpdateOrMarkPendingReload(int hvo, int tag, int return true; } } - else if (tag == SegmentTags.kflidAnalyses && m_publisher.OwningFieldName == "Wordforms") + // tag == WfiWordformTags.kflidAnalyses is needed for wordforms that don't appear in a segment. + else if ((tag == SegmentTags.kflidAnalyses || tag == WfiWordformTags.kflidAnalyses) && m_publisher.OwningFieldName == "Wordforms") { // Changing this potentially changes the list of wordforms that occur in the interesting texts. // Hopefully we don't rebuild the list every time; usually this can only be changed in another view.