Skip to content

Commit

Permalink
fix(html): кириллические ссылки в урлах
Browse files Browse the repository at this point in the history
https://jira.railsc.ru/browse/SERVICES-594

старая логика заменяла unicode ссылки на представление в URI-encoding:

```
[97] pry(main)> Nokogiri::HTML::DocumentFragment.parse('<a href="http://www.фермаежей.рф/index.html">ссылка</a>').to_s
=> "<a href=\"http://www.%D1%84%D0%B5%D1%80%D0%BC%D0%B0%D0%B5%D0%B6%D0%B5%D0%B9.%D1%80%D1%84/index.html\">ссылка</a>"
```
  • Loading branch information
DmitryBochkarev committed Oct 23, 2015
1 parent e05076f commit 4017e3d
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 8 deletions.
41 changes: 34 additions & 7 deletions lib/string_tools/html.rb
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
# coding: utf-8
require 'loofah'
require 'nokogiri'
require 'addressable/uri'
require 'simpleidn'

module StringTools
module HTML
# Minimum length of a string that can contain a link at all:
# the length of the opening `<a href="` prefix.
TEXT_WITH_LINKS_MINIMUM_LENGTH = '<a href="'.length
# Options handed to Nokogiri's node serialization when the HTML string is rebuilt.
# NOTE(review): the hash is not frozen; Nokogiri's Node#serialize appears to write
# :encoding into the options hash it receives — confirm before adding `.freeze`.
HTML_SERIALIZE_OPTIONS = {
indent: 0,
# serialize as XHTML, because when serializing as HTML libxml2 does a bit more than we would like
# (it URI-encodes attribute values such as href):
# http://stackoverflow.com/questions/24174032/prevent-nokogiri-from-url-encoding-src-attributes
save_with: Nokogiri::XML::Node::SaveOptions::AS_XHTML
}

# Public: Удаляет ссылки на неразрешенные домены
#
Expand Down Expand Up @@ -34,24 +41,37 @@ module HTML
# Removes links to non-whitelisted domains from an HTML string.
#
# html    - String with HTML markup.
# options - Hash passed to LinksRemoveScrubber.new, which reads :whitelist from it.
#
# Returns the original string when it is too short to contain a link or when
# no link was removed; otherwise returns the re-serialized fragment.
def self.remove_links(html, options = {})
# a string shorter than `<a href="` cannot contain a link, so parsing is skipped
return html if html.length < TEXT_WITH_LINKS_MINIMUM_LENGTH

# NOTE(review): the next line looks like the removed side of the diff (the old
# Loofah-based implementation) — confirm it is not part of the resulting file.
Loofah.fragment(html).scrub!(LinksRemoveScrubber.new(options)).to_s
doc = Nokogiri::HTML::DocumentFragment.parse(html)
scrubber = LinksRemoveScrubber.new(options)

# run the scrubber over every anchor element of the fragment
doc.css('a'.freeze).each { |node| scrubber.call node }

# re-serialize only when something was actually removed; otherwise hand back
# the caller's string byte-for-byte, so untouched markup is never rewritten
if scrubber.done_changes?
doc.children.map { |node| node.serialize HTML_SERIALIZE_OPTIONS }.join
else
html
end
end

class LinksRemoveScrubber < Loofah::Scrubber
class LinksRemoveScrubber
# Public: Builds a scrubber for the given options.
#
# options - Hash; the :whitelist key is required (fetched with Hash#fetch,
#           so a missing key raises KeyError).
#
# The "changes were made" flag starts out as false.
def initialize(options)
  allowed_domains = options.fetch(:whitelist)
  @whitelist = allowed_domains
  @is_have_done_changes = false
end

# Public: Tells whether this scrubber has unwrapped at least one link.
#
# Returns the current value of the @is_have_done_changes flag.
def done_changes?
@is_have_done_changes
end

def scrub(node)
return unless node.name == 'a'.freeze
# Public: Unwraps a link node when its host is not whitelisted.
#
# node - link element; must respond to #[] (attribute lookup). Presumably a
#        Nokogiri::XML::Node — confirm against the caller.
#
# Links with a blank href, or whose parsed URI has no host, are left untouched.
# The normalized host is passed through SimpleIDN.to_unicode so that the
# whitelist is checked against the unicode form of the domain.
#
# Every removal goes through #replace_with_content, so the "changes were made"
# flag stays accurate on both the regular and the error path.
def call(node)
  href = node['href']
  return if href.blank?
  uri = Addressable::URI.parse(href).normalize
  return unless uri.host
  replace_with_content node unless whitelisted? SimpleIDN.to_unicode(uri.host)
rescue
  # in any unclear situation (e.g. an unparsable href) simply remove the link
  replace_with_content node
end

def whitelisted?(domain)
Expand All @@ -64,6 +84,13 @@ def whitelisted?(domain)
end
false
end

private

# Internal: Replaces the node with its own children and records that the
# document has been modified.
#
# node - element to unwrap; must respond to #children and #swap.
def replace_with_content(node)
  inner_nodes = node.children
  node.swap(inner_nodes)
  @is_have_done_changes = true
end
end
end
end
18 changes: 18 additions & 0 deletions spec/html_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -122,5 +122,23 @@
end
end
end

# Links whose unicode (IDN) host matches the whitelist must survive unchanged;
# links to other unicode domains are replaced by their text.
context 'unicode domains' do
  subject { StringTools::HTML.remove_links(html, whitelist: ['фермаежей.рф']) }

  let(:html) do
    <<-MARKUP
<a href="https://www.фермаежей.рф">www.фермаежей.рф</a>
<a href="https://www.мояфермаежей.рф">www.мояфермаежей.рф</a>
    MARKUP
  end

  it 'should keep whitelisted unicode links and unwrap the others' do
    is_expected.to eq(<<-MARKUP)
<a href="https://www.фермаежей.рф">www.фермаежей.рф</a>
www.мояфермаежей.рф
    MARKUP
  end
end
end
end
3 changes: 2 additions & 1 deletion string_tools.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ Gem::Specification.new do |spec|
spec.add_runtime_dependency 'addressable', '~> 2.3.2'
spec.add_runtime_dependency 'ru_propisju', '~> 2.1.4'
spec.add_runtime_dependency 'sanitize', '>= 3.1.2'
spec.add_runtime_dependency 'loofah', '>= 2.0.0'
spec.add_runtime_dependency 'nokogiri'
spec.add_runtime_dependency 'simpleidn', '>= 0.0.5'

spec.add_development_dependency 'bundler', '~> 1.7'
spec.add_development_dependency 'rake', '~> 10.0'
Expand Down

0 comments on commit 4017e3d

Please sign in to comment.