Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Removes hyphens from the keyphrase before checking if slug contains keyphrase #17424

Merged
merged 2 commits into from
Sep 10, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,8 @@ const expectedResults = {
},
urlKeyword: {
isApplicable: true,
score: 6,
resultText: "<a href='https://yoa.st/33o' target='_blank'>Keyphrase in slug</a>: (Part of) your keyphrase does not appear in the slug. <a href='https://yoa.st/33p' target='_blank'>Change that</a>!",
score: 9,
resultText: "<a href='https://yoa.st/33o' target='_blank'>Keyphrase in slug</a>: Great work!",
},
urlLength: {
isApplicable: true,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,27 @@ describe( "test to check url for keyword", function() {
expect( urlKeyword( paper, researcher ) ).toEqual( { keyphraseLength: 2, percentWordMatches: 100 } );
} );

it( "returns no matches for dashed words", function() {
it( "returns no matches for differently dashed words", function() {
const paper = new Paper( "", { url: "url-with-key-word", keyword: "keyword" } );
const researcher = new EnglishResearcher( paper );
researcher.addResearchData( "morphology", morphologyData );
expect( urlKeyword( paper, researcher ) ).toEqual( { keyphraseLength: 1, percentWordMatches: 0 } );
} );

it( "returns matches for equally dashed words", function() {
// The keyphrase "key-word" uses the same hyphenation as the "key-word" part of the slug.
const paper = new Paper( "", { url: "url-with-key-word", keyword: "key-word" } );
const researcher = new EnglishResearcher( paper );
researcher.addResearchData( "morphology", morphologyData );
// Expected: the hyphenated keyphrase is counted as two words (keyphraseLength 2) and all of them are matched.
expect( urlKeyword( paper, researcher ) ).toEqual( { keyphraseLength: 2, percentWordMatches: 100 } );
} );

it( "returns matches for equally dashed words with more words around", function() {
// The hyphenated "key-word" is surrounded by two extra words that do not occur in the slug.
const paper = new Paper( "", { url: "url-with-key-word", keyword: "exciting key-word exciting" } );
const researcher = new EnglishResearcher( paper );
researcher.addResearchData( "morphology", morphologyData );
// Expected: four keyphrase words in total, of which half (presumably "key" and "word") are matched in the slug.
expect( urlKeyword( paper, researcher ) ).toEqual( { keyphraseLength: 4, percentWordMatches: 50 } );
} );

it( "returns matches with diacritics", function() {
const paper = new Paper( "", { url: "url-with-key-word", keyword: "këyword" } );
const researcher = new EnglishResearcher( paper );
Expand Down Expand Up @@ -197,21 +211,20 @@ describe( "test to check url for keyword", function() {
const paper = new Paper( "", { url: "buku-buku", keyword: "buku-buku" } );
const researcher = new EnglishResearcher( paper );
researcher.addResearchData( "morphology", morphologyData );
expect( urlKeyword( paper, researcher ) ).toEqual( { keyphraseLength: 1, percentWordMatches: 100 } );
expect( urlKeyword( paper, researcher ) ).toEqual( { keyphraseLength: 2, percentWordMatches: 100 } );
} );

it( "works with dash within the keyword in url", function() {
const paper = new Paper( "", { url: "on-the-go", keyword: "on-the-go" } );
const researcher = new EnglishResearcher( paper );
researcher.addResearchData( "morphology", morphologyData );
expect( urlKeyword( paper, researcher ) ).toEqual( { keyphraseLength: 1, percentWordMatches: 100 } );
expect( urlKeyword( paper, researcher ) ).toEqual( { keyphraseLength: 3, percentWordMatches: 100 } );
} );

// eslint-disable-next-line capitalized-comments
/* it( "works with dash within the keyword in url", function() {
const paper = new Paper( "", { url: "two-room-apartment", keyword: "two-room apartment" } );
it( "works with dash within the keyword in url", function() {
const paper = new Paper( "", { url: "two-room-apartment", keyword: "two-room apartment" } );
const researcher = new EnglishResearcher( paper );
researcher.addResearchData( "morphology", morphologyData );
expect( urlKeyword( paper, researcher ) ).toEqual( { keyphraseLength: 1, percentWordMatches: 100 } );
} );*/
expect( urlKeyword( paper, researcher ) ).toEqual( { keyphraseLength: 3, percentWordMatches: 100 } );
} );
} );
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,37 @@
import parseSlug from "../helpers/url/parseSlug";
import { findTopicFormsInString } from "../helpers/match/findKeywordFormsInString.js";

/**
 * Splits hyphenated keyphrases so that each compound is an individual word, e.g. 'pop-art' becomes 'pop' and 'art'.
 * Splitting the keyphrase forms allows for hyphenated keyphrases to be detected in the slug. The slug is parsed on hyphens, and the words from
 * the keyphrase are compared with the words from the slug to find a match. Without dehyphenating the keyphrase, the word from the keyphrase would be
 * 'pop-art' while the words from the slug would be 'pop' and 'art', and a match would not be detected.
 *
 * The passed object is left untouched: a shallow copy carrying the new keyphraseForms is returned. This avoids altering
 * a research result that other code may still hold a reference to.
 *
 * @param {Object}  topicForms                The keyphraseForms and synonymsForms of the paper.
 * @param {Array[]} topicForms.keyphraseForms One array of word forms per word of the keyphrase.
 *
 * @returns {Object} A copy of topicForms in which the keyphraseForms have split compounds.
 */
function dehyphenateKeyphraseForms( topicForms ) {
	const dehyphenatedKeyphraseForms = [];

	topicForms.keyphraseForms.forEach( function( wordForms ) {
		/*
		 * Only the first form is inspected to decide whether a word is hyphenated.
		 * A word without hyphens (or without any forms at all) is kept as is.
		 */
		if ( wordForms.length === 0 || wordForms[ 0 ].indexOf( "-" ) === -1 ) {
			dehyphenatedKeyphraseForms.push( wordForms );
			return;
		}

		// Split each form of a hyphenated word and add each compound, as a word of its own, to the dehyphenated keyphrase forms.
		wordForms.forEach( function( wordForm ) {
			wordForm.split( "-" ).forEach( compound => dehyphenatedKeyphraseForms.push( [ compound ] ) );
		} );
	} );

	// Return a copy instead of reassigning topicForms.keyphraseForms, so the caller's object is not mutated.
	return Object.assign( {}, topicForms, { keyphraseForms: dehyphenatedKeyphraseForms } );
}

/**
* Matches the keyword in the URL. Replaces dashes and underscores with whitespace and uses whitespace as the word boundary.
*
Expand All @@ -11,17 +42,11 @@ import { findTopicFormsInString } from "../helpers/match/findKeywordFormsInStrin
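* @returns {Object} An object containing the length of the keyphrase (keyphraseLength) and the percentage of keyphrase words found in the slug (percentWordMatches).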
*/
export default function( paper, researcher ) {
const topicForms = researcher.getResearch( "morphology" );
const topicForms = dehyphenateKeyphraseForms( researcher.getResearch( "morphology" ) );
const parsedSlug = parseSlug( paper.getUrl() );

let keyphraseInSlug = findTopicFormsInString( topicForms, parsedSlug, false, paper.getLocale() );
/* In case we deal with a language where dashes are part of the word (e.g., in Indonesian: buku-buku),
* Try looking for the keywords in the unparsed slug.
*/
if ( keyphraseInSlug.percentWordMatches === 0 ) {
const unparsedSlug = paper.getUrl();
keyphraseInSlug = findTopicFormsInString( topicForms, unparsedSlug, false, paper.getLocale() );
}
const keyphraseInSlug = findTopicFormsInString( topicForms, parsedSlug, false, paper.getLocale() );

return {
keyphraseLength: topicForms.keyphraseForms.length,
percentWordMatches: keyphraseInSlug.percentWordMatches,
Expand Down