From 19ff973558bdde89781918096e3e60659b6d6258 Mon Sep 17 00:00:00 2001 From: Natalia Date: Tue, 5 Feb 2019 16:34:28 +0100 Subject: [PATCH 1/2] Fix keyphrase in slug researcher --- spec/fullTextTests/testTexts/ru/russianPaper1.js | 4 ++-- spec/researches/keywordCountInUrlSpec.js | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/spec/fullTextTests/testTexts/ru/russianPaper1.js b/spec/fullTextTests/testTexts/ru/russianPaper1.js index 6a3c22fae2..4d158fa8a6 100644 --- a/spec/fullTextTests/testTexts/ru/russianPaper1.js +++ b/spec/fullTextTests/testTexts/ru/russianPaper1.js @@ -82,8 +82,8 @@ const expectedResults = { }, urlKeyword: { isApplicable: true, - score: 6, - resultText: "Keyphrase in slug: (Part of) your keyphrase does not appear in the slug. Change that!", + score: 9, + resultText: "Keyphrase in slug: Great work!", }, urlLength: { isApplicable: true, diff --git a/spec/researches/keywordCountInUrlSpec.js b/spec/researches/keywordCountInUrlSpec.js index aff923aeb5..1e2e546f63 100644 --- a/spec/researches/keywordCountInUrlSpec.js +++ b/spec/researches/keywordCountInUrlSpec.js @@ -18,13 +18,27 @@ describe( "test to check url for keyword", function() { expect( urlKeyword( paper, researcher ) ).toEqual( { keyphraseLength: 2, percentWordMatches: 100 } ); } ); - it( "returns no matches for dashed words", function() { + it( "returns no matches for differently dashed words", function() { const paper = new Paper( "", { url: "url-with-key-word", keyword: "keyword" } ); const researcher = new Researcher( paper ); researcher.addResearchData( "morphology", morphologyData ); expect( urlKeyword( paper, researcher ) ).toEqual( { keyphraseLength: 1, percentWordMatches: 0 } ); } ); + it( "returns matches for equally dashed words", function() { + const paper = new Paper( "", { url: "url-with-key-word", keyword: "key-word" } ); + const researcher = new Researcher( paper ); + researcher.addResearchData( "morphology", morphologyData ); + expect( 
urlKeyword( paper, researcher ) ).toEqual( { keyphraseLength: 2, percentWordMatches: 100 } ); + } ); + + it( "returns matches for equally dashed words with more words around", function() { + const paper = new Paper( "", { url: "url-with-key-word", keyword: "exciting key-word exciting" } ); + const researcher = new Researcher( paper ); + researcher.addResearchData( "morphology", morphologyData ); + expect( urlKeyword( paper, researcher ) ).toEqual( { keyphraseLength: 4, percentWordMatches: 50 } ); + } ); + it( "returns matches with diacritics", function() { const paper = new Paper( "", { url: "url-with-key-word", keyword: "këyword" } ); const researcher = new Researcher( paper ); From e1a5e0bbd14d0e90f56dd0cc63f8722525adf498 Mon Sep 17 00:00:00 2001 From: Natalia Date: Tue, 5 Feb 2019 16:39:36 +0100 Subject: [PATCH 2/2] Commit the forgotten research --- src/researches/keywordCountInUrl.js | 47 ++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/src/researches/keywordCountInUrl.js b/src/researches/keywordCountInUrl.js index bd3b7e6013..ab34dd052c 100644 --- a/src/researches/keywordCountInUrl.js +++ b/src/researches/keywordCountInUrl.js @@ -2,6 +2,51 @@ import { findTopicFormsInString } from "./findKeywordFormsInString.js"; +/** + * Trims the modifier from compound words and makes it a separate keyphrase entry. + * E.g., for a keyphrase "modern pop-art" the current version of the morphological research generates forms + * [ [ modern, moderner, ...], [ pop-art, pop-arts, ...] ]. This is problematic for the research that searches for + * keyphrase in slug, because it treats compound words as 2 words. I.e., the research is looking for forms `pop` and `art`. + * This function takes the default-generated morphological forms and splits the compound words into two, such that + * the forms that serve input to the keyphraseCountInUrl researcher are + * [ [ modern, moderner, ...], [ pop ], [ art, arts, ...] ]. 
/**
 * Splits hyphenated compound words in the keyphrase forms into separate keyphrase entries.
 *
 * E.g., for a keyphrase "modern pop-art" the morphological research generates the forms
 * [ [ modern, moderner, ... ], [ pop-art, pop-arts, ... ] ]. The keyphrase-in-slug research replaces dashes and
 * underscores in the slug with spaces, so a compound can only be found as separate words (`pop` and `art`).
 * This function therefore turns the forms above into
 * [ [ modern, moderner, ... ], [ pop ], [ art, arts, ... ] ].
 *
 * Every part of a compound except the last one is treated as an unchanged modifier and becomes a single-form
 * entry; the forms of the last part are derived from the forms of the compound. Compounds with more than one
 * hyphen are split at every hyphen, e.g. [ state-of-the-art, state-of-the-arts ] becomes
 * [ state ], [ of ], [ the ], [ art, arts ].
 *
 * The passed object is left untouched; a shallow copy with the new `keyphraseForms` is returned.
 * NOTE(review): the research result may be shared with other researches - confirm against the Researcher.
 *
 * @param {Object}  topicForms                The keyphraseForms and synonymsForms of the paper.
 * @param {Array[]} topicForms.keyphraseForms One array of word forms per keyphrase word.
 *
 * @returns {Object} A copy of topicForms in which hyphenated compounds are split into separate entries.
 */
function dehyphenateKeyphraseForms( topicForms ) {
	const dehyphenatedKeyphraseForms = [];

	topicForms.keyphraseForms.forEach( function( lemma ) {
		const firstWord = lemma[ 0 ];

		// Entries without forms or without a hyphen in their first form are passed through as they are.
		if ( typeof firstWord !== "string" || firstWord.indexOf( "-" ) === -1 ) {
			dehyphenatedKeyphraseForms.push( lemma );
			return;
		}

		// Everything before the last hyphen is the unchanged part; only the last part carries the word forms.
		const modifiers = firstWord.split( "-" ).slice( 0, -1 );
		const unchangedPrefix = modifiers.join( "-" ) + "-";

		modifiers.forEach( function( modifier ) {
			// A leading or doubled hyphen produces an empty part, which is not a word to search for.
			if ( modifier !== "" ) {
				dehyphenatedKeyphraseForms.push( [ modifier ] );
			}
		} );

		const dehyphenatedLemma = [];
		lemma.forEach( function( wordInLemma ) {
			// Matching on the prefix including its hyphen keeps forms like "pops-art" from being cut into "-art".
			if ( wordInLemma.indexOf( unchangedPrefix ) !== 0 ) {
				return;
			}

			const trimmedWordInLemma = wordInLemma.slice( unchangedPrefix.length );
			if ( trimmedWordInLemma !== "" ) {
				dehyphenatedLemma.push( trimmedWordInLemma );
			}
		} );

		// A trailing hyphen leaves no forms for the last part; do not add an empty entry in that case.
		if ( dehyphenatedLemma.length > 0 ) {
			dehyphenatedKeyphraseForms.push( dehyphenatedLemma );
		}
	} );

	return Object.assign( {}, topicForms, { keyphraseForms: dehyphenatedKeyphraseForms } );
}

// Remaining patch context for src/researches/keywordCountInUrl.js, kept verbatim. The body of the default
// export continues beyond the visible part of the patch.
//
//  /**
//   * Matches the keyword in the URL. Replaces dashes and underscores with whitespaces and uses whitespace as wordboundary.
//   *
// @@ -11,7 +56,7 @@ import { findTopicFormsInString } from "./findKeywordFormsInString.js";
//   * @returns {int} Number of times the keyword is found.
//   */
//  export default function( paper, researcher ) {
// -	const topicForms = researcher.getResearch( "morphology" );
// +	const topicForms = dehyphenateKeyphraseForms( researcher.getResearch( "morphology" ) );
//  	const slug = paper.getUrl().replace( /[-_]/ig, " " );
//  	const keyphraseInSlug = findTopicFormsInString( topicForms, slug, false, paper.getLocale() );