Skip to content

Commit

Permalink
Merge pull request #40 from evseevleo/BPC-15151
Browse files Browse the repository at this point in the history
feat(translit): add keyboard layout switch&transliteration
  • Loading branch information
isqad authored Sep 25, 2019
2 parents eec174c + 87d134d commit aff7d58
Show file tree
Hide file tree
Showing 5 changed files with 325 additions and 14 deletions.
25 changes: 25 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,31 @@ Or install it yourself as:

TODO: Write usage instructions here

### Transliteration

Usage: ```StringTools.transliteration_variations(<string>)```.
The method returns an Array of Strings: the given string, the same string in the other keyboard layout, and the transliteration of whichever of the first two strings is in Russian.
If the string contains a character that is not part of the RU <-> EN keyboard mapping, or contains both Russian and English characters, only the given string wrapped in an Array is returned.
Examples:
```ruby
StringTools.transliteration_variations('"Мы почитаем всех нулями, А единицами — себя." - А. С. Пушкин')
=> ["\"Мы почитаем всех нулями, А единицами — себя.\" - А. С. Пушкин",
"@Vs gjxbnftv dct[ yekzvb? F tlbybwfvb — ct,z/@ - F/ C/ Geirby",
"\"My` pochitaem vsex nulyami, A ediniczami — sebya.\" - A. S. Pushkin"]
```
```ruby
StringTools.transliteration_variations('Ntrcn d ytdthyjq hfcrkflrt')
=> ["Ntrcn d ytdthyjq hfcrkflrt", "Текст в неверной раскладке", "Tekst v nevernoj raskladke"]
```
```ruby
StringTools.transliteration_variations('Еуче шт цкщтп лунищфкв дфнщгею')
=> ["Еуче шт цкщтп лунищфкв дфнщгею", "Text in wrong keyboard layout.", "Euche sht czkshhtp lunishhfkv dfnshhgeyu"]
```
```ruby
StringTools.transliteration_variations('ﻮﻴﻜﻴﺒﻳﺪﻳ')
=> ["ﻮﻴﻜﻴﺒﻳﺪﻳ"]
```

## Development

After checking out the repo, run `bundle install` to install dependencies. Then, run `bin/console` for an interactive prompt that will allow you to experiment.
Expand Down
243 changes: 231 additions & 12 deletions lib/string_tools.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ module StringTools
autoload :HTML, 'string_tools/html'

module CharDet
CP1251_COMPATIBLE_ENCODINGS =
%w(windows-1253 windows-1254 windows-1255 windows-1256 windows-1258 EUC-TW ISO-8859-8).freeze

# Возвращает true если строка содержит допустимую
# последовательность байтов для кодировки utf8 и false в обратном случае
# см. http://en.wikipedia.org/wiki/UTF-8
Expand All @@ -29,18 +32,6 @@ def to_utf8(str)
def to_cp1251(str)
str.to_cp1251
end

def cp1251_compatible_encodings
[
'windows-1253',
'windows-1254',
'windows-1255',
'windows-1256',
'windows-1258',
'EUC-TW',
'ISO-8859-8'
]
end
end
extend CharDet

Expand Down Expand Up @@ -256,4 +247,232 @@ def add_params_to_url(url, params = nil)
end
end
extend Uri

module Transliteration
  # Character typed on the English (Latin) layout => character the same key
  # gives on the Russian layout.
  LAYOUT_EN_TO_RU_MAP = {
    'q' => 'й', 'Q' => 'Й',
    'w' => 'ц', 'W' => 'Ц',
    'e' => 'у', 'E' => 'У',
    'r' => 'к', 'R' => 'К',
    't' => 'е', 'T' => 'Е',
    'y' => 'н', 'Y' => 'Н',
    'u' => 'г', 'U' => 'Г',
    'i' => 'ш', 'I' => 'Ш',
    'o' => 'щ', 'O' => 'Щ',
    'p' => 'з', 'P' => 'З',
    '[' => 'х',
    '{' => 'Х',
    ']' => 'ъ',
    '}' => 'Ъ',
    '|' => '/',
    '`' => 'ё',
    '~' => 'Ё',
    'a' => 'ф', 'A' => 'Ф',
    's' => 'ы', 'S' => 'Ы',
    'd' => 'в', 'D' => 'В',
    'f' => 'а', 'F' => 'А',
    'g' => 'п', 'G' => 'П',
    'h' => 'р', 'H' => 'Р',
    'j' => 'о', 'J' => 'О',
    'k' => 'л', 'K' => 'Л',
    'l' => 'д', 'L' => 'Д',
    ';' => 'ж',
    ':' => 'Ж',
    "'" => 'э',
    '"' => 'Э',
    'z' => 'я', 'Z' => 'Я',
    'x' => 'ч', 'X' => 'Ч',
    'c' => 'с', 'C' => 'С',
    'v' => 'м', 'V' => 'М',
    'b' => 'и', 'B' => 'И',
    'n' => 'т', 'N' => 'Т',
    'm' => 'ь', 'M' => 'Ь',
    ',' => 'б',
    '<' => 'Б',
    '.' => 'ю',
    '>' => 'Ю',
    '/' => '.',
    '?' => ',',
    '@' => '"',
    '#' => '№',
    '$' => ';',
    '^' => ':',
    '&' => '?'
  }.freeze

  # Russian layout => English layout. Derived from the map above (all of its
  # values are unique) so the two directions can never drift apart.
  LAYOUT_RU_TO_EN_MAP = LAYOUT_EN_TO_RU_MAP.invert.freeze

  # Characters that are kept as they are when the layout is swapped.
  LAYOUT_PERSISTENT = {
    '0' => '0', '1' => '1', '2' => '2', '3' => '3', '4' => '4',
    '5' => '5', '6' => '6', '7' => '7', '8' => '8', '9' => '9',
    '!' => '!',
    '*' => '*',
    '(' => '(',
    ')' => ')',
    ' ' => ' ',
    '-' => '-',
    '—' => '—',
    '_' => '_',
    '=' => '=',
    '+' => '+'
  }.freeze

  # Russian letter => its Latin transliteration.
  TRANSLIT_RU_TO_EN_MAP = {
    'щ' => 'shh', 'Щ' => 'Shh',
    'ё' => 'yo', 'Ё' => 'Yo',
    'ж' => 'zh', 'Ж' => 'Zh',
    'ц' => 'cz', 'Ц' => 'Cz',
    'ч' => 'ch', 'Ч' => 'Ch',
    'ш' => 'sh', 'Ш' => 'Sh',
    'ъ' => '``', 'Ъ' => '``',
    'ы' => 'y`', 'Ы' => 'Y`',
    'э' => 'e`', 'Э' => 'E`',
    'ю' => 'yu', 'Ю' => 'Yu',
    'я' => 'ya', 'Я' => 'Ya',
    'а' => 'a', 'А' => 'A',
    'б' => 'b', 'Б' => 'B',
    'в' => 'v', 'В' => 'V',
    'г' => 'g', 'Г' => 'G',
    'д' => 'd', 'Д' => 'D',
    'е' => 'e', 'Е' => 'E',
    'з' => 'z', 'З' => 'Z',
    'и' => 'i', 'И' => 'I',
    'й' => 'j', 'Й' => 'J',
    'к' => 'k', 'К' => 'K',
    'л' => 'l', 'Л' => 'L',
    'м' => 'm', 'М' => 'M',
    'н' => 'n', 'Н' => 'N',
    'о' => 'o', 'О' => 'O',
    'п' => 'p', 'П' => 'P',
    'р' => 'r', 'Р' => 'R',
    'с' => 's', 'С' => 'S',
    'т' => 't', 'Т' => 'T',
    'у' => 'u', 'У' => 'U',
    'ф' => 'f', 'Ф' => 'F',
    'х' => 'x', 'Х' => 'X',
    'ь' => '`', 'Ь' => '`'
  }.freeze

  # Public: variants of a string obtained by swapping the keyboard layout
  # and/or transliterating it, for Russian and English.
  # The layout swap works in both directions; transliteration only goes from
  # Russian to English.
  #
  # str - String
  #
  # Examples
  #   transliteration_variations('Ruby')
  #   => ['Ruby', 'Кгин', 'Kgin']
  #   transliteration_variations('Слово')
  #   => ['Слово', 'Ckjdj', 'Slovo']
  #   transliteration_variations('КомпанияPro')
  #   => ['КомпанияPro']
  #   transliteration_variations('ويكيبيدي')
  #   => ['ويكيبيدي']
  #
  # Returns Array of String: the given string first, then (only when the
  # layout could be swapped) the layout-swapped string and the transliteration.
  def transliteration_variations(str)
    str_as_chars = str.chars
    converted = convert_layout(str_as_chars)
    swapped_chars = converted[:chars]

    # Transliterate whichever side is Russian: the input itself when it was
    # typed in Russian, otherwise the result of the layout swap.
    russian_chars = converted[:was_ru] ? str_as_chars : swapped_chars
    translit_chars = transliterate(russian_chars)

    # Plain nil guards instead of ActiveSupport's Object#try: both values are
    # nil when the layout could not be swapped.
    layout_swap = swapped_chars.join if swapped_chars
    transliterated = translit_chars.join if translit_chars

    [str, layout_swap, transliterated].compact
  end

  private

  # Internal: swaps the keyboard layout of an array of characters, ru <-> en.
  # The Russian -> English direction is tried first, then English -> Russian.
  #
  # splitted_string - Array of String (single characters)
  #
  # Examples
  #   convert_layout(['a', 'b', 'c']) =>
  #     {chars: ['ф', 'и', 'с'], was_ru: false}
  #   convert_layout(['а', 'б', 'в']) =>
  #     {chars: ['f', ',', 'd'], was_ru: true}
  #   convert_layout(['ﻮ', 'ﻴ', 'ﻜ']) =>
  #     {chars: nil, was_ru: false}
  #
  # Returns Hash with two keys:
  #   :chars  - Array of characters in the other layout, or nil when the
  #             layout could not be swapped in either direction
  #   :was_ru - true when every character could be converted with the
  #             Russian -> English map (or is a persistent character)
  def convert_layout(splitted_string)
    from_ru = swap_layout(splitted_string, LAYOUT_RU_TO_EN_MAP)
    return {chars: from_ru, was_ru: true} if from_ru

    {chars: swap_layout(splitted_string, LAYOUT_EN_TO_RU_MAP), was_ru: false}
  end

  # Internal: converts every character with the given layout map, leaving
  # LAYOUT_PERSISTENT characters untouched.
  #
  # chars      - Array of String (single characters)
  # layout_map - Hash, one of the LAYOUT_*_MAP constants
  #
  # Returns Array of String, or nil as soon as a character is found that is
  # in neither map.
  def swap_layout(chars, layout_map)
    # `break` without a value aborts `map` and makes it evaluate to nil.
    chars.map { |char| layout_map[char] || LAYOUT_PERSISTENT[char] || break }
  end

  # Internal: transliterates an array of characters, ru -> en.
  # Characters missing from the dictionary are left unchanged.
  #
  # splitted_string - Array of String (single characters) or nil
  #
  # Returns Array of String, or nil when nil was given.
  def transliterate(splitted_string)
    return unless splitted_string

    splitted_string.map { |char| TRANSLIT_RU_TO_EN_MAP[char] || char }
  end
end
extend Transliteration
end
3 changes: 2 additions & 1 deletion lib/string_tools/core_ext/string.rb
Original file line number Diff line number Diff line change
Expand Up @@ -154,10 +154,11 @@ def remove_colors
end
end

WIN_1251_ENCODING = 'windows-1251'.freeze
# shorthand
def detect_encoding
e = ::CharDet.detect(self)["encoding"]
e = 'windows-1251' if StringTools.cp1251_compatible_encodings.include?(e)
e = WIN_1251_ENCODING if StringTools::CharDet::CP1251_COMPATIBLE_ENCODINGS.include?(e)
e
end

Expand Down
2 changes: 1 addition & 1 deletion spec/spec_helper.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
$LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
$LOAD_PATH.unshift File.expand_path('../lib', __dir__)
require 'simplecov'

SimpleCov.start do
Expand Down
66 changes: 66 additions & 0 deletions spec/string_tools_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -171,4 +171,70 @@
it { expect(StringTools.valid_utf8?('foobar')).to be true }
it { expect(StringTools.valid_utf8?(nil)).to be false }
end

describe '#transliteration_variations' do
  # `subject` is declared with RSpec's own helper rather than `let(:subject)`;
  # `str` is supplied by each context below.
  subject(:variations) { described_class.transliteration_variations(str) }

  # The two layout maps must be exact inverses of each other, otherwise a
  # layout swap would not round-trip between Russian and English.
  describe 'maps consistency' do
    let(:en_to_ru) { described_class::Transliteration::LAYOUT_EN_TO_RU_MAP }
    let(:ru_to_en) { described_class::Transliteration::LAYOUT_RU_TO_EN_MAP }

    it { expect(en_to_ru.size).to eq ru_to_en.size }
    it { expect(en_to_ru.keys).to match_array ru_to_en.values }
    it { expect(ru_to_en.keys).to match_array en_to_ru.values }
  end

  context 'when english string' do
    let(:str) { 'qwertyuiop[]asdfghjkl;\'zxcvbnm,./' }

    it do
      expect(variations).to match_array [str,
                                         'йцукенгшщзхъфывапролджэячсмитьбю.',
                                         'jczukengshshhzx``fy`vaproldzhe`yachsmit`byu.']
    end
  end

  context 'when russian string' do
    let(:str) { 'йцукенгшщзхъфывапролджэячсмитьбю.' }

    it do
      expect(variations).to match_array [str,
                                         'qwertyuiop[]asdfghjkl;\'zxcvbnm,./',
                                         'jczukengshshhzx``fy`vaproldzhe`yachsmit`byu.']
    end
  end

  context 'when string has russian AND english chars' do
    let(:str) { 'abc абв' }

    it { expect(variations).to match_array [str] }
  end

  context 'when string has other language chars' do
    let(:str) { 'ﻮﻴﻜﻴﺒﻳﺪﻳ' }

    it { expect(variations).to match_array [str] }
  end

  context 'when upper case' do
    let(:str) { 'AbCd' }

    it 'preserve case' do
      expect(variations).to match_array [str, 'ФиСв', 'FiSv']
    end
  end

  context 'when string has other chars' do
    let(:str) { '0123456789!*() -_=+ abc' }

    it 'preserves them' do
      expect(variations).to match_array [str,
                                         '0123456789!*() -_=+ фис',
                                         '0123456789!*() -_=+ fis']
    end
  end
end
end

0 comments on commit aff7d58

Please sign in to comment.