Skip to content

Commit

Permalink
Merge pull request #49 from taleksei/29908
Browse files Browse the repository at this point in the history
feat: add options for sanitize outer links in css
  • Loading branch information
taleksei authored Aug 30, 2023
2 parents 79a34b2 + bed389b commit c5cf13b
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 2 deletions.
21 changes: 19 additions & 2 deletions lib/string_tools.rb
Original file line number Diff line number Diff line change
Expand Up @@ -155,13 +155,21 @@ class Base

# Tag names that `sanitize` unions with the keys of the attributes hash to
# build the :elements list; presumably these tags are permitted with no
# attributes of their own (confirm against TAGS_WITH_ATTRIBUTES).
TAGS_WITHOUT_ATTRIBUTES = %w(b strong i em sup sub ul ol li blockquote br tr u caption thead s).freeze

# Public: Sanitize a string.
# str - String to sanitize
# attrs - Hash of custom tags and attributes (default: empty hash); may also contain:
#   :remove_contents - Set of Strings, tags to be removed (default: style and script)
#   :protocols - Array of Strings, protocols allowed in CSS property urls (default: none)
def sanitize(str, attrs = {})
# для корректного обрезания utf строчек режем через mb_chars
# для защиты от перегрузки парсера пропускаем максимум 1 мегабайт текста
# длина русского символа в utf-8 - 2 байта, 1Мб/2б = 524288 = 2**19 символов
# длина по символам с перестраховкой, т.к. латинские символы(теги, например) занимают 1 байт
str = str.mb_chars.slice(0..(2**19)).to_s

remove_contents = attrs.delete(:remove_contents)
protocols = attrs.delete(:protocols) || []

# Мерджим добавочные теги и атрибуты
attributes = TAGS_WITH_ATTRIBUTES.merge(attrs)
elements = attributes.keys | TAGS_WITHOUT_ATTRIBUTES
Expand All @@ -173,8 +181,8 @@ def sanitize(str, attrs = {})
str,
:attributes => attributes,
:elements => elements,
:css => {:properties => Sanitize::Config::RELAXED[:css][:properties]},
:remove_contents => %w(style script),
:css => {:properties => Sanitize::Config::RELAXED[:css][:properties], protocols: protocols},
:remove_contents => remove_contents || Set['style', 'script'],
:allow_comments => false,
:transformers => transformers
)
Expand All @@ -191,6 +199,7 @@ def call(env)
normalize_link node, 'href'
when 'img'
normalize_link node, 'src'
remove_links node, 'alt'
end
end

Expand All @@ -202,6 +211,14 @@ def normalize_link(node, attr_name)
rescue Addressable::URI::InvalidURIError
node.swap node.children
end

# Strips every URI-looking substring out of one attribute of a node.
#
# node      - object responding to [], []= and remove_attribute
# attr_name - String, name of the attribute to clean
#
# Does nothing when the attribute is absent. If nothing but links was stored
# in the attribute, the attribute itself is removed from the node.
def remove_links(node, attr_name)
  raw_value = node[attr_name]
  return unless raw_value

  # Drop the links, then collapse the whitespace they leave behind.
  without_links = raw_value.gsub(URI::DEFAULT_PARSER.make_regexp, '').squish
  node[attr_name] = without_links

  node.remove_attribute(attr_name) if without_links.empty?
end
end

class IframeNormalizer
Expand Down
36 changes: 36 additions & 0 deletions spec/string_tools_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,42 @@
to eq('<iframe width="123" height="456" src="https://www.youtube.com/embed/abc" frameborder="0"></iframe>')
end

# No :protocols option is passed here, so the url() in the style attribute is
# expected to be rejected and the style attribute dropped.
it 'removes outer link from css when protocols are not given' do
  origin_str = '<div style="background-image: url(http://i54.tinypic.com/4zuxif.jpg)"></div>'
  sanitized_string = described_class.sanitize(origin_str)
  expect(sanitized_string).to eq('<div></div>')
end

# With http/https passed as :protocols the url() value is expected to survive
# sanitizing unchanged.
it 'does not remove outer link from css when protocols given' do
  origin_str = '<div style="background-image: url(http://i54.tinypic.com/4zuxif.jpg)"></div>'
  sanitized_string = described_class.sanitize(origin_str, protocols: %w[http https])
  expect(sanitized_string).to eq('<div style="background-image: url(http://i54.tinypic.com/4zuxif.jpg)"></div>')
end

# With default options a <style> element is expected to disappear entirely.
it 'removes style content' do
  html = '<style type="text/css">body{color: red;}</style>'

  expect(described_class.sanitize(html)).to eq('')
end

# The style tag is allowed explicitly (with its type attribute) and left out of
# :remove_contents, so both the tag and its body are expected to be kept.
it 'does not remove style content' do
  origin_str = '<style type="text/css">body{color: red;}</style>'
  sanitized_string = described_class.sanitize(origin_str, 'style' => %w(type), remove_contents: Set['script'])
  expect(sanitized_string).to eq('<style type="text/css">body{color: red;}</style>')
end

# Links inside alt are expected to be cut out while the remaining words stay.
it 'removes links in alt attribute of img tag' do
  html = '<img scr="http://test.test" alt="http://test.test test https://test.test alt">'
  result = described_class.sanitize(html, 'img' => %w(scr alt))

  expect(result).to eq('<img scr="http://test.test" alt="test alt">')
end

# An alt that consists of a link only is expected to vanish from the tag.
it 'removes alt attribute of img tag if empty value' do
  html = '<img scr="http://test.test" alt="http://test.test">'
  result = described_class.sanitize(html, 'img' => %w(scr alt))

  expect(result).to eq('<img scr="http://test.test">')
end

context 'multiple invocations of the method' do
it 'does not mess up default config' do
origin_str = '<p style="text-align: center;" title="foobar"></p>'
Expand Down

0 comments on commit c5cf13b

Please sign in to comment.