diff --git a/README.md b/README.md index 5aacf13..b8589b7 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ const newString = sw.removeStopwords(oldString, [ 'even', 'a', 'custom', 'stopwo ### <language code> -Arrays of stopwords for the following 24 languages are supplied: +Arrays of stopwords for the following 27 languages are supplied: * `ar` - Modern Standard Arabic * `bn` - Bengali @@ -71,10 +71,13 @@ Arrays of stopwords for the following 24 languages are supplied: * `pt` - Portuguese * `pa` - Punjabi Gurmukhi * `ru` - Russian +* `so` - Somali +* `st` - Sotho * `sv` - Swedish * `sw` - Swahili * `yo` - Yoruba * `zh` - Chinese Simplified +* `zu` - Zulu ```javascript sw = require('stopword') diff --git a/lib/stopword.js b/lib/stopword.js index 9560d97..c59b2c0 100644 --- a/lib/stopword.js +++ b/lib/stopword.js @@ -31,7 +31,10 @@ exports.pa = require('./stopwords_pa.js').words exports.pl = require('./stopwords_pl.js').words exports.pt = require('./stopwords_pt.js').words exports.ru = require('./stopwords_ru.js').words +exports.so = require('./stopwords_so.js').words +exports.st = require('./stopwords_st.js').words exports.sv = require('./stopwords_sv.js').words exports.sw = require('./stopwords_sw.js').words exports.yo = require('./stopwords_yo.js').words exports.zh = require('./stopwords_zh.js').words +exports.zu = require('./stopwords_zu.js').words diff --git a/lib/stopwords_so.js b/lib/stopwords_so.js new file mode 100644 index 0000000..34a417e --- /dev/null +++ b/lib/stopwords_so.js @@ -0,0 +1,30 @@ +/* Copyright 2016 Liam Doherty + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* A list of commonly used words that have little meaning and can be excluded +from analysis. +This list is frequency sorted. That means it can be sliced from the bottom +and be less aggressive in excluding stopwords */ + +var words = [ + 'oo', 'atabo', 'ay', 'ku', 'waxeey', 'uu', 'lakin', 'si', 'ayuu', 'soo', + 'waa', 'ka', 'kasoo', 'kale', 'waxuu', 'ayee', 'ayaa', 'kuu', 'isku', 'ugu', + 'jiray', 'dhan', 'dambeestii', 'inuu', 'in', 'jirtay', 'uheestay', 'aad', + 'uga', 'hadana', 'timaado', 'timaaday' +] + + +// Tell the world about the noise words. +exports.words = words diff --git a/lib/stopwords_st.js b/lib/stopwords_st.js new file mode 100644 index 0000000..4739892 --- /dev/null +++ b/lib/stopwords_st.js @@ -0,0 +1,30 @@ +/* Copyright 2016 Liam Doherty + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* A list of commonly used words that have little meaning and can be excluded +from analysis. +This list is frequency sorted. 
That means it can be sliced from the bottom +and be less aggressive in excluding stopwords */ + +var words = [ + 'a', 'le', 'o', 'ba', 'ho', 'oa', 'ea', 'ka', 'hae', 'tselane', 'eaba', 'ke', + 'hore', 'ha', 'e', 'ne', 're', 'bona', 'me', 'limo', 'tsa', 'haholo', 'la', + 'empa', 'ngoanake', 'se', 'moo', 'm\'e', 'bane', 'mo', 'tse', 'sa', 'li', + 'ena', 'bina', 'pina', 'hape' +] + + +// Tell the world about the noise words. +exports.words = words diff --git a/lib/stopwords_zu.js b/lib/stopwords_zu.js new file mode 100644 index 0000000..8913d48 --- /dev/null +++ b/lib/stopwords_zu.js @@ -0,0 +1,31 @@ +/* Copyright 2016 Liam Doherty + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +/* A list of commonly used words that have little meaning and can be excluded +from analysis. +This list is frequency sorted. That means it can be sliced from the bottom +and be less aggressive in excluding stopwords */ + +var words = [ + 'ukuthi', 'kodwa', 'futhi', 'kakhulu', 'wakhe', 'kusho', 'uma', 'wathi', + 'umama', 'kanye', 'phansi', 'ngesikhathi', 'lapho', 'u', 'zakhe', 'khona', + 'ukuba', 'nje', 'phezulu', 'yakhe', 'kungani', 'wase', 'la', 'mina', 'wami', + 'ukuze', 'unonkungu', 'wabona', 'wahamba', 'lakhe', 'yami', 'kanjani', + 'kwakukhona', 'ngelinye' +] + + +// Tell the world about the noise words. 
+exports.words = words diff --git a/package.json b/package.json index 30614db..9b58dcf 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "stopword", - "version": "0.2.0", - "description": "A module for node.js that takes in text and returns text that is stripped of stopwords. Has pre-defined stopword lists for 24 languages and also takes lists with custom stopwords as input.", + "version": "0.2.1", + "description": "A module for node.js that takes in text and returns text that is stripped of stopwords. Has pre-defined stopword lists for 27 languages and also takes lists with custom stopwords as input.", "main": "lib/stopword.js", "scripts": { "test": "mocha" diff --git a/test/test.js b/test/test.js index a8ea602..ce81930 100644 --- a/test/test.js +++ b/test/test.js @@ -164,6 +164,24 @@ describe('general stopwordiness:', function () { newString.should.eql(['ẹ̀gbà', 'ọrùn', 'gbẹ́', 'lére', 'àwòrán', 'akọni', 'obìrin', 'mọ̀', 'si', 'ìyá', 'wa', 'olorì', 'idia', 'ọ̀rundún', 'mẹ́rìndínlógún', 'ṣẹ́yìn']) }) + it('should remove zulu stopwords', function () { + const oldString = 'ukhisimusi isikhathi esiletha injabulo kubantu abaningi emhlabeni jikelele lesi sikhathi senza ukuba kube khona ukuhlangana nemindeni okudala yagcinana kuhlalwe kuphunyulwe futhi kuncokolwe'.split(' ') + const newString = sw.removeStopwords(oldString, sw.zu) + newString.should.eql(['ukhisimusi', 'isikhathi', 'esiletha', 'injabulo', 'kubantu', 'abaningi', 'emhlabeni', 'jikelele', 'lesi', 'sikhathi', 'senza', 'kube', 'ukuhlangana', 'nemindeni', 'okudala', 'yagcinana', 'kuhlalwe', 'kuphunyulwe', 'kuncokolwe']) + }) + + it('should remove sotho stopwords', function () { + const oldString = 'ke boloetse ba batho bo bokwang ke kokwanahloko ya ebola matshwao hantlentle a qala matsatsi a mabedi ho isa ho a mararo ka mora ho tshwaetswa ke kokwanahloko'.split(' ') + const newString = sw.removeStopwords(oldString, sw.st) + newString.should.eql(['boloetse', 'batho', 'bo', 'bokwang', 
'kokwanahloko', 'ya', 'ebola', 'matshwao', 'hantlentle', 'qala', 'matsatsi', 'mabedi', 'isa', 'mararo', 'mora', 'tshwaetswa', 'kokwanahloko']) + }) + + it('should remove somali stopwords', function () { + const oldString = 'isku celcelis qaarada antarktika waa tan ugu qaboow qalalsan ee ugu dabaysha badan qaaradaha caalamka oo dhan'.split(' ') + const newString = sw.removeStopwords(oldString, sw.so) + newString.should.eql(['celcelis', 'qaarada', 'antarktika', 'tan', 'qaboow', 'qalalsan', 'ee', 'dabaysha', 'badan', 'qaaradaha', 'caalamka']) + }) + // Right to Left languages