Skip to content

Commit

Permalink
fix: convert to_utf8 correctly, without hiding errors
Browse files Browse the repository at this point in the history
  • Loading branch information
isqad committed Jun 15, 2023
1 parent 9bc5cf5 commit 8e9fa48
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 20 deletions.
35 changes: 15 additions & 20 deletions lib/string_tools/core_ext/string.rb
Original file line number Diff line number Diff line change
Expand Up @@ -163,32 +163,21 @@ def detect_encoding
e
end

# In-place UTF-8 conversion: builds the converted copy with #to_utf8 and then
# overwrites the receiver's contents with it. Returns the receiver.
def to_utf8!
  converted = to_utf8
  replace(converted)
end

# NOTE(review): this span appears to be a flattened diff hunk. Two variants of
# to_utf8 are interleaved here (one without arguments that ends in a bare
# `rescue`, one taking `inplace`), followed by a to_utf8! that delegates to
# to_utf8(true). It is not runnable as shown; confirm against the real file.
def to_utf8
# already UTF-8
return self if is_utf8?
def to_utf8(inplace = false)
return self if valid_encoding? && is_utf8?

enc = detect_encoding
source_enc = detect_encoding
return '' unless source_enc

# UTF-8 or plain English (ASCII) letters are fine as well
return self if ['utf-8', 'ascii'].include?(enc)

# unrecognizable jumble: return an empty string
return '' if enc.nil?
encode_utf8(source_enc, inplace)
end

# otherwise try to transcode
encode 'utf-8', enc, :undef => :replace, :invalid => :replace
rescue
''
def to_utf8!
to_utf8(true)
end

# Re-encodes the string as cp1251, replacing undefined or invalid characters
# instead of raising.
# NOTE(review): this span appears to be a flattened diff hunk. Two variants of
# the encode call are interleaved: one followed by a bare `rescue` that yields
# '', and one that passes replace: '' and has no rescue. Confirm which lines
# belong to the current version.
def to_cp1251
encode 'cp1251', :undef => :replace, :invalid => :replace
rescue
''
encode 'cp1251', undef: :replace, invalid: :replace, replace: ''
end

def to_cp1251!
Expand Down Expand Up @@ -230,6 +219,12 @@ def remove_nonprintable!

private

# Transcodes the receiver to UTF-8, interpreting its bytes as +source_enc+.
# Byte sequences that are invalid in the source encoding, or characters that
# have no UTF-8 mapping, are dropped (replaced with an empty string) instead
# of raising.
#
# source_enc - name of the encoding the receiver's bytes are currently in
# inplace    - when true the receiver itself is re-encoded (encode!);
#              otherwise a converted copy is returned (encode)
#
# Returns the converted string (the receiver itself when inplace is true).
def encode_utf8(source_enc, inplace = false)
  # The options must be passed as real keyword arguments. A Hash stored as the
  # last element of a splatted Array is a plain positional argument on
  # Ruby 3+, which makes encode/encode! raise ArgumentError.
  options = { undef: :replace, invalid: :replace, replace: '' }

  if inplace
    encode!('utf-8', source_enc, **options)
  else
    encode('utf-8', source_enc, **options)
  end
end

# Builds a string made of the given sequence, followed by the result of
# protect_escape_of for that same sequence, followed by ANSI_CLEAR.
def surround_with_ansi(ascii_seq)
  escaped = protect_escape_of(ascii_seq)

  "#{ascii_seq}#{escaped}#{ANSI_CLEAR}"
end
Expand Down
24 changes: 24 additions & 0 deletions spec/string_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,30 @@
require 'spec_helper'

describe String do
describe '#to_utf8' do
  # UTF-8 text followed by a dangling lead byte, i.e. an invalid string.
  let(:invalid_str) { "кирилиц\xD0".dup }

  it do
    expect(invalid_str).not_to be_valid_encoding

    # Convert once and check every property on the same result.
    converted = invalid_str.to_utf8

    expect(converted).to be_valid_encoding
    expect(converted.is_utf8?).to eq(true)
    expect(converted).to eq('кирилиц')
  end
end

describe '#to_utf8!' do
  # UTF-8 text followed by a dangling lead byte, i.e. an invalid string.
  let(:invalid_str) { "кирилиц\xD0".dup }

  it do
    # Precondition: the fixture really is broken before the conversion.
    expect(invalid_str).not_to be_valid_encoding

    invalid_str.to_utf8!

    # The receiver itself must have been fixed in place.
    expect(invalid_str).to be_valid_encoding
    expect(invalid_str.is_utf8?).to eq(true)
    expect(invalid_str).to eq('кирилиц')
  end
end

describe '#mb_downcase' do
  # Cyrillic capital letters must be lowercased too.
  it do
    expect('Кириллица'.mb_downcase).to eq('кириллица')
  end
end
Expand Down

0 comments on commit 8e9fa48

Please sign in to comment.