From 2588cf5a3f63f49d26e1174ac446bdfeb3bce3dc Mon Sep 17 00:00:00 2001 From: Michael Scrivo Date: Fri, 7 Jun 2024 10:11:09 -0400 Subject: [PATCH 1/3] Add rubocop --- .github/workflows/build.yml | 17 +++- Gemfile.lock | 28 ++++++ Rakefile | 2 +- html2text.gemspec | 35 +++---- lib/html2text.rb | 192 +++++++++++++++++------------------- lib/html2text/version.rb | 2 +- spec/examples_spec.rb | 20 ++-- spec/html2text_spec.rb | 46 ++++----- spec/spec_helper.rb | 6 +- 9 files changed, 190 insertions(+), 158 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 25a7eeb..fabcbe2 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -18,7 +18,6 @@ permissions: jobs: test: - runs-on: ubuntu-latest strategy: matrix: @@ -33,3 +32,19 @@ jobs: bundler-cache: true # runs 'bundle install' and caches installed gems automatically - name: Run tests run: bundle exec rake + + lint: + runs-on: ubuntu-latest + strategy: + matrix: + ruby-version: ['3.0', '3.1', '3.2', '3.3'] + + steps: + - uses: actions/checkout@v4 + - name: Set up Ruby + uses: ruby/setup-ruby@v1 + with: + ruby-version: ${{ matrix.ruby-version }} + bundler-cache: true # runs 'bundle install' and caches installed gems automatically + - name: Run Rubocop + run: bundle exec rubocop diff --git a/Gemfile.lock b/Gemfile.lock index c92f82b..b89805f 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -7,17 +7,28 @@ PATH GEM remote: https://rubygems.org/ specs: + ast (2.4.2) bundler-audit (0.6.1) bundler (>= 1.2.0, < 3) thor (~> 0.18) colorize (0.7.7) diff-lcs (1.3) + json (2.7.2) + language_server-protocol (3.17.0.3) mini_portile2 (2.8.7) nokogiri (1.16.5) mini_portile2 (~> 2.8.2) racc (~> 1.4) + parallel (1.24.0) + parser (3.3.2.0) + ast (~> 2.4.1) + racc racc (1.8.0) + rainbow (3.1.1) rake (12.3.3) + regexp_parser (2.9.2) + rexml (3.2.8) + strscan (>= 3.0.9) rspec (3.8.0) rspec-core (~> 3.8.0) rspec-expectations (~> 3.8.0) @@ -33,7 +44,23 @@ GEM diff-lcs (>= 1.2.0, < 2.0) 
rspec-support (~> 3.8.0) rspec-support (3.8.0) + rubocop (1.64.1) + json (~> 2.3) + language_server-protocol (>= 3.17.0) + parallel (~> 1.10) + parser (>= 3.3.0.2) + rainbow (>= 2.2.2, < 4.0) + regexp_parser (>= 1.8, < 3.0) + rexml (>= 3.2.5, < 4.0) + rubocop-ast (>= 1.31.1, < 2.0) + ruby-progressbar (~> 1.7) + unicode-display_width (>= 2.4.0, < 3.0) + rubocop-ast (1.31.3) + parser (>= 3.3.1.0) + ruby-progressbar (1.13.0) + strscan (3.1.0) thor (0.20.3) + unicode-display_width (2.5.0) PLATFORMS ruby @@ -45,6 +72,7 @@ DEPENDENCIES rake rspec rspec-collection_matchers + rubocop BUNDLED WITH 2.1.4 diff --git a/Rakefile b/Rakefile index 70a846d..f34a48c 100644 --- a/Rakefile +++ b/Rakefile @@ -2,4 +2,4 @@ require 'rspec/core/rake_task' RSpec::Core::RakeTask.new(:spec) -task :default => :spec +task default: :spec diff --git a/html2text.gemspec b/html2text.gemspec index 9557811..9213100 100644 --- a/html2text.gemspec +++ b/html2text.gemspec @@ -1,27 +1,28 @@ -$:.push File.expand_path("../lib", __FILE__) +$:.push File.expand_path('lib', __dir__) # Maintain your gem's version: -require "html2text/version" +require 'html2text/version' # Describe your gem and declare its dependencies: Gem::Specification.new do |s| - s.name = "html2text" + s.name = 'html2text' s.version = Html2Text::VERSION - s.authors = ["Jevon Wright"] - s.email = ["jevon@jevon.org"] - s.homepage = "https://github.com/soundasleep/html2text_ruby" - s.summary = "Convert HTML into plain text." - s.description = "A Ruby component to convert HTML into a plain text format." - s.license = "MIT" + s.authors = ['Jevon Wright'] + s.email = ['jevon@jevon.org'] + s.homepage = 'https://github.com/soundasleep/html2text_ruby' + s.summary = 'Convert HTML into plain text.' + s.description = 'A Ruby component to convert HTML into a plain text format.' 
+ s.license = 'MIT' - s.files = Dir["lib/**/*", "LICENSE.md", "README.md", "CHANGELOG.md"] - s.test_files = Dir["spec/**/*"] + s.files = Dir['lib/**/*', 'LICENSE.md', 'README.md', 'CHANGELOG.md'] + s.test_files = Dir['spec/**/*'] - s.add_dependency "nokogiri", ['>= 1.0', '< 2.0'] + s.add_dependency 'nokogiri', ['>= 1.0', '< 2.0'] - s.add_development_dependency "rspec" - s.add_development_dependency "rspec-collection_matchers" - s.add_development_dependency "colorize" - s.add_development_dependency "rake" - s.add_development_dependency "bundler-audit" + s.add_development_dependency 'bundler-audit' + s.add_development_dependency 'colorize' + s.add_development_dependency 'rake' + s.add_development_dependency 'rspec' + s.add_development_dependency 'rspec-collection_matchers' + s.add_development_dependency 'rubocop' end diff --git a/lib/html2text.rb b/lib/html2text.rb index d672cbc..7abc7d3 100644 --- a/lib/html2text.rb +++ b/lib/html2text.rb @@ -12,12 +12,12 @@ def self.convert(html) if is_office_document?(html) # Emulate the CSS rendering of Office documents - html = html.gsub("

", "
") - .gsub(" ", "
") - .gsub("", "") + html = html.gsub('

', '
') + .gsub(' ', '
') + .gsub('', '') end - if !html.include?(" tags html = "

#{html}
" end @@ -33,17 +33,17 @@ def self.fix_newlines(text) end def self.replace_entities(text) - text.gsub(" ", " ").gsub("\u00a0", " ").gsub("‌", "") + text.gsub(' ', ' ').gsub("\u00a0", ' ').gsub('‌', '') end def convert output = iterate_over(doc) output = remove_leading_and_trailing_whitespace(output) output = remove_unnecessary_empty_lines(output) - return output.strip + output.strip end - DO_NOT_TOUCH_WHITESPACE = "" + DO_NOT_TOUCH_WHITESPACE = '' def remove_leading_and_trailing_whitespace(text) # ignore any
 <pre> blocks, which we don't want to interact with
@@ -51,20 +51,20 @@ def remove_leading_and_trailing_whitespace(text)
 
     output = []
     pre_blocks.each.with_index do |block, index|
-      if index % 2 == 0
-        output << block.gsub(/[ \t]*\n[ \t]*/im, "\n").gsub(/ *\t */im, "\t")
-      else
-        output << block
-      end
+      output << if index.even?
+                  block.gsub(/[ \t]*\n[ \t]*/im, "\n").gsub(/ *\t */im, "\t")
+                else
+                  block
+                end
     end
 
-    output.join("")
+    output.join('')
   end
 
   private
 
   def self.is_office_document?(text)
-    text.include?("urn:schemas-microsoft-com:office")
+    text.include?('urn:schemas-microsoft-com:office')
   end
 
   def remove_unnecessary_empty_lines(text)
@@ -75,191 +75,179 @@ def trimmed_whitespace(text)
     # Replace whitespace characters with a space (equivalent to \s)
     # and force any text encoding into UTF-8
     if text.valid_encoding?
-      text.gsub(/[\t\n\f\r ]+/im, " ")
+      text.gsub(/[\t\n\f\r ]+/im, ' ')
     else
-      text.force_encoding("WINDOWS-1252")
-      return trimmed_whitespace(text.encode("UTF-16be", invalid: :replace, replace: "?").encode('UTF-8'))
+      text.force_encoding('WINDOWS-1252')
+      trimmed_whitespace(text.encode('UTF-16be', invalid: :replace, replace: '?').encode('UTF-8'))
     end
   end
 
   def iterate_over(node)
-    return "\n" if node.name.downcase == "br" && next_node_is_text?(node)
+    return "\n" if node.name.downcase == 'br' && next_node_is_text?(node)
 
     return trimmed_whitespace(node.text) if node.text?
 
-    if ["style", "head", "title", "meta", "script"].include?(node.name.downcase)
-      return ""
-    end
+    return '' if %w[style head title meta script].include?(node.name.downcase)
 
-    if node.name.downcase == "pre"
-      return "\n#{DO_NOT_TOUCH_WHITESPACE}#{node.text}#{DO_NOT_TOUCH_WHITESPACE}"
-    end
+    return "\n#{DO_NOT_TOUCH_WHITESPACE}#{node.text}#{DO_NOT_TOUCH_WHITESPACE}" if node.name.downcase == 'pre'
 
     output = []
 
     output << prefix_whitespace(node)
     output += node.children.map do |child|
-      if !child.name.nil?
-        iterate_over(child)
-      end
+      iterate_over(child) unless child.name.nil?
     end
     output << suffix_whitespace(node)
 
-    output = output.compact.join("") || ""
+    output = output.compact.join('') || ''
 
-    if !node.name.nil?
-      if node.name.downcase == "a"
+    unless node.name.nil?
+      if node.name.downcase == 'a'
         output = wrap_link(node, output)
-      elsif node.name.downcase == "img"
+      elsif node.name.downcase == 'img'
         output = image_text(node)
       end
     end
 
-    return output
+    output
   end
 
   def prefix_whitespace(node)
     case node.name.downcase
-      when "hr"
-        "\n---------------------------------------------------------------\n"
+    when 'hr'
+      "\n---------------------------------------------------------------\n"
 
-      when "h1", "h2", "h3", "h4", "h5", "h6", "ol", "ul"
-        "\n\n"
+    when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul'
+      "\n\n"
 
-      when "p"
-        "\n\n"
+    when 'p'
+      "\n\n"
 
-      when "tr"
-        "\n"
+    when 'tr'
+      "\n"
 
-      when "div"
-        if node.parent.name == "div" && (node.parent.text.strip == node.text.strip)
-          ""
-        else
-          "\n"
-        end
+    when 'div'
+      if node.parent.name == 'div' && (node.parent.text.strip == node.text.strip)
+        ''
+      else
+        "\n"
+      end
 
-      when "td", "th"
-        "\t"
+    when 'td', 'th'
+      "\t"
 
-      when "li"
-        "- "
+    when 'li'
+      '- '
     end
   end
 
   def suffix_whitespace(node)
     case node.name.downcase
-      when "h1", "h2", "h3", "h4", "h5", "h6"
-        # add another line
-        "\n\n"
+    when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
+      # add another line
+      "\n\n"
 
-      when "p"
-        "\n\n"
+    when 'p'
+      "\n\n"
 
-      when "br"
-        if next_node_name(node) != "div" && next_node_name(node) != nil
-          "\n"
-        end
+    when 'br'
+      "\n" if next_node_name(node) != 'div' && !next_node_name(node).nil?
 
-      when "li"
-        "\n"
+    when 'li'
+      "\n"
 
-      when "div"
-        if next_node_is_text?(node)
-          "\n"
-        elsif next_node_name(node) != "div" && next_node_name(node) != nil
-          "\n"
-        end
+    when 'div'
+      if next_node_is_text?(node)
+        "\n"
+      elsif next_node_name(node) != 'div' && !next_node_name(node).nil?
+        "\n"
+      end
     end
   end
 
   # links are returned in [text](link) format
   def wrap_link(node, output)
-    href = node.attribute("href")
-    name = node.attribute("name")
+    href = node.attribute('href')
+    name = node.attribute('name')
 
     output = output.strip
 
     # remove double [[ ]]s from linking images
-    if output[0] == "[" && output[-1] == "]"
+    if output[0] == '[' && output[-1] == ']'
       output = output[1, output.length - 2]
 
       # for linking images, the title of the <img> overrides the title of the <a>
-      if node.attribute("title")
-        output = node.attribute("title").to_s
-      end
+      output = node.attribute('title').to_s if node.attribute('title')
     end
 
     # if there is no link text, but a title attr
-    if output.empty? && node.attribute("title")
-      output = node.attribute("title").to_s
-    end
+    output = node.attribute('title').to_s if output.empty? && node.attribute('title')
 
     if href.nil?
-      if !name.nil?
-        output = "[#{output}]"
-      end
+      output = "[#{output}]" unless name.nil?
     else
       href = href.to_s
 
       if href != output && href != "mailto:#{output}" &&
-          href != "http://#{output}" && href != "https://#{output}"
-        if output.empty?
-          output = href
-        else
-          output = "[#{output}](#{href})"
-        end
+         href != "http://#{output}" && href != "https://#{output}"
+        output = if output.empty?
+                   href
+                 else
+                   "[#{output}](#{href})"
+                 end
       end
     end
 
     case next_node_name(node)
-      when "h1", "h2", "h3", "h4", "h5", "h6"
-        output += "\n"
+    when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
+      output += "\n"
     end
 
     output
   end
 
   def image_text(node)
-    if node.attribute("title")
-      "[" + node.attribute("title").to_s + "]"
-    elsif node.attribute("alt")
-      "[" + node.attribute("alt").to_s + "]"
+    if node.attribute('title')
+      '[' + node.attribute('title').to_s + ']'
+    elsif node.attribute('alt')
+      '[' + node.attribute('alt').to_s + ']'
     else
-      ""
+      ''
     end
   end
 
   def next_node_name(node)
     next_node = node.next_sibling
-    while next_node != nil
+    until next_node.nil?
       break if next_node.element?
+
       next_node = next_node.next_sibling
     end
 
-    if next_node && next_node.element?
-      next_node.name.downcase
-    end
+    return unless next_node && next_node.element?
+
+    next_node.name.downcase
   end
 
   def next_node_is_text?(node)
-    return !node.next_sibling.nil? && node.next_sibling.text? && !node.next_sibling.text.strip.empty?
+    !node.next_sibling.nil? && node.next_sibling.text? && !node.next_sibling.text.strip.empty?
   end
 
   def previous_node_name(node)
     previous_node = node.previous_sibling
-    while previous_node != nil
+    until previous_node.nil?
       break if previous_node.element?
+
       previous_node = previous_node.previous_sibling
     end
 
-    if previous_node && previous_node.element?
-      previous_node.name.downcase
-    end
+    return unless previous_node && previous_node.element?
+
+    previous_node.name.downcase
   end
 
   def previous_node_is_text?(node)
-    return !node.previous_sibling.nil? && node.previous_sibling.text? && !node.previous_sibling.text.strip.empty?
+    !node.previous_sibling.nil? && node.previous_sibling.text? && !node.previous_sibling.text.strip.empty?
   end
 
   # def previous_node_is_not_text?(node)
diff --git a/lib/html2text/version.rb b/lib/html2text/version.rb
index 652f510..464f127 100644
--- a/lib/html2text/version.rb
+++ b/lib/html2text/version.rb
@@ -1,3 +1,3 @@
 class Html2Text
-  VERSION = "0.3.1"
+  VERSION = '0.3.1'
 end
diff --git a/spec/examples_spec.rb b/spec/examples_spec.rb
index 8ca39a2..c0e0010 100644
--- a/spec/examples_spec.rb
+++ b/spec/examples_spec.rb
@@ -1,32 +1,32 @@
-require "spec_helper"
+require 'spec_helper'
 
 describe Html2Text do
-  describe "#convert" do
+  describe '#convert' do
     let(:text) { Html2Text.convert(html) }
 
-    examples = Dir[File.dirname(__FILE__) + "/examples/*.html"]
+    examples = Dir[File.dirname(__FILE__) + '/examples/*.html']
 
     examples.each do |filename|
       context "#{filename}" do
         let(:html) { File.read(filename) }
-        let(:text_file) { filename.sub(".html", ".txt") }
+        let(:text_file) { filename.sub('.html', '.txt') }
         let(:expected) { Html2Text.fix_newlines(File.read(text_file)) }
 
-        it "has an expected output" do
+        it 'has an expected output' do
           expect(File.exist?(text_file)).to eq(true), "'#{text_file}' did not exist"
         end
 
-        it "converts to text" do
+        it 'converts to text' do
           # Write the output if it failed, for easier comparison
-          if !text.eql?(expected)
-            File.open(filename.sub(".html", ".output"), 'w') do |fp|
+          unless text.eql?(expected)
+            File.open(filename.sub('.html', '.output'), 'w') do |fp|
               fp.write(text)
             end
           end
 
           # Quick check, don't try to generate a 500kb+ diff,
           # which can halt the rspec for minutes+
-          expect(text.length).to eq expected.length if text.length > 10000
+          expect(text.length).to eq expected.length if text.length > 10_000
 
           # More complete check
           expect(text).to eq expected
@@ -34,7 +34,7 @@
       end
     end
 
-    it "has examples to test" do
+    it 'has examples to test' do
       expect(examples.size).to_not eq(0)
     end
   end
diff --git a/spec/html2text_spec.rb b/spec/html2text_spec.rb
index dd95bd6..1d28b25 100644
--- a/spec/html2text_spec.rb
+++ b/spec/html2text_spec.rb
@@ -1,56 +1,56 @@
-require "spec_helper"
+require 'spec_helper'
 
 describe Html2Text do
-  describe "#convert" do
+  describe '#convert' do
     let(:text) { Html2Text.convert(html) }
 
-    context "an empty line" do
-      let(:html) { "" }
+    context 'an empty line' do
+      let(:html) { '' }
 
-      it "is an empty line" do
-        expect(text).to eq("")
+      it 'is an empty line' do
+        expect(text).to eq('')
       end
     end
 
-    context "a simple string" do
-      let(:html) { "hello world" }
+    context 'a simple string' do
+      let(:html) { 'hello world' }
 
-      it "is an empty line" do
-        expect(text).to eq("hello world")
+      it 'is an empty line' do
+        expect(text).to eq('hello world')
       end
     end
 
-    context "input value is non-string" do
+    context 'input value is non-string' do
       let(:html) { nil }
       it '(nil)' do
-        expect(text).to eq("")
+        expect(text).to eq('')
       end
     end
 
-    context "input value is non-string" do
+    context 'input value is non-string' do
       let(:html) { 1234 }
-      it "(number)" do
-        expect(text).to eq("1234")
+      it '(number)' do
+        expect(text).to eq('1234')
       end
     end
 
-    context "input value is non-string" do
+    context 'input value is non-string' do
       let(:html) { 1234.5600 }
-      it "(float number)" do
-        expect(text).to eq("1234.56")
+      it '(float number)' do
+        expect(text).to eq('1234.56')
       end
     end
   end
 
-  describe "#remove_leading_and_trailing_whitespace" do
+  describe '#remove_leading_and_trailing_whitespace' do
     let(:subject) { Html2Text.new(nil).remove_leading_and_trailing_whitespace(input) }
 
-    context "an empty string" do
-      let(:input) { "" }
-      it { is_expected.to eq("") }
+    context 'an empty string' do
+      let(:input) { '' }
+      it { is_expected.to eq('') }
     end
 
-    context "many new lines" do
+    context 'many new lines' do
       let(:input) { "hello\n  world \n yes" }
       it { is_expected.to eq("hello\nworld\nyes") }
     end
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
index 6469064..a7cdbb6 100644
--- a/spec/spec_helper.rb
+++ b/spec/spec_helper.rb
@@ -1,4 +1,4 @@
-require "rspec"
-require "rspec/collection_matchers"
+require 'rspec'
+require 'rspec/collection_matchers'
 
-require File.join(File.dirname(__FILE__), "..", "lib", "html2text")
+require File.join(File.dirname(__FILE__), '..', 'lib', 'html2text')

From 6363861dbd61d37bbf19f46e813a27d8aa809673 Mon Sep 17 00:00:00 2001
From: Michael Scrivo 
Date: Fri, 7 Jun 2024 10:25:56 -0400
Subject: [PATCH 2/3] Fix remaining issues

---
 .rubocop.yml             | 35 +++++++++++++++++++++++++++++++++++
 Gemfile                  |  2 ++
 Gemfile.lock             |  9 ++++++++-
 Rakefile                 |  2 ++
 html2text.gemspec        | 10 ++++++++--
 lib/html2text.rb         | 26 ++++++++++++++++----------
 lib/html2text/version.rb |  2 ++
 spec/examples_spec.rb    | 12 +++++-------
 spec/html2text_spec.rb   |  2 ++
 spec/spec_helper.rb      |  2 ++
 10 files changed, 82 insertions(+), 20 deletions(-)
 create mode 100644 .rubocop.yml

diff --git a/.rubocop.yml b/.rubocop.yml
new file mode 100644
index 0000000..6e22451
--- /dev/null
+++ b/.rubocop.yml
@@ -0,0 +1,35 @@
+require:
+  - rubocop-performance
+  - rubocop-rake
+
+AllCops:
+  NewCops: enable
+  TargetRubyVersion: 3.0
+
+Metrics/MethodLength:
+  Max: 30
+
+Metrics/ClassLength:
+  Max: 200
+
+Metrics/ModuleLength:
+  Max: 200
+
+Metrics/BlockLength:
+  Max: 50
+
+Gemspec/DevelopmentDependencies:
+  EnforcedStyle: gemspec
+
+# TODO: Enable these cops after fixing the issues
+Metrics/CyclomaticComplexity:
+  Enabled: false
+
+Metrics/PerceivedComplexity:
+  Enabled: false
+
+Metrics/AbcSize:
+  Enabled: false
+
+Style/Documentation:
+  Enabled: false
diff --git a/Gemfile b/Gemfile
index fdc9cbb..bccc7f9 100644
--- a/Gemfile
+++ b/Gemfile
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 source 'https://rubygems.org'
 
 # Declare your gem's dependencies in whatever.gemspec.
diff --git a/Gemfile.lock b/Gemfile.lock
index b89805f..e4c0cd9 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -57,6 +57,11 @@ GEM
       unicode-display_width (>= 2.4.0, < 3.0)
     rubocop-ast (1.31.3)
       parser (>= 3.3.1.0)
+    rubocop-performance (1.21.0)
+      rubocop (>= 1.48.1, < 2.0)
+      rubocop-ast (>= 1.31.1, < 2.0)
+    rubocop-rake (0.6.0)
+      rubocop (~> 1.0)
     ruby-progressbar (1.13.0)
     strscan (3.1.0)
     thor (0.20.3)
@@ -73,6 +78,8 @@ DEPENDENCIES
   rspec
   rspec-collection_matchers
   rubocop
+  rubocop-performance
+  rubocop-rake
 
 BUNDLED WITH
-   2.1.4
+   2.5.11
diff --git a/Rakefile b/Rakefile
index f34a48c..cffdd09 100644
--- a/Rakefile
+++ b/Rakefile
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require 'rspec/core/rake_task'
 
 RSpec::Core::RakeTask.new(:spec)
diff --git a/html2text.gemspec b/html2text.gemspec
index 9213100..4385634 100644
--- a/html2text.gemspec
+++ b/html2text.gemspec
@@ -1,4 +1,6 @@
-$:.push File.expand_path('lib', __dir__)
+# frozen_string_literal: true
+
+$LOAD_PATH.push File.expand_path('lib', __dir__)
 
 # Maintain your gem's version:
 require 'html2text/version'
@@ -13,9 +15,9 @@ Gem::Specification.new do |s|
   s.summary     = 'Convert HTML into plain text.'
   s.description = 'A Ruby component to convert HTML into a plain text format.'
   s.license     = 'MIT'
+  s.required_ruby_version = '>= 3.0'
 
   s.files = Dir['lib/**/*', 'LICENSE.md', 'README.md', 'CHANGELOG.md']
-  s.test_files = Dir['spec/**/*']
 
   s.add_dependency 'nokogiri', ['>= 1.0', '< 2.0']
 
@@ -25,4 +27,8 @@ Gem::Specification.new do |s|
   s.add_development_dependency 'rspec'
   s.add_development_dependency 'rspec-collection_matchers'
   s.add_development_dependency 'rubocop'
+  s.add_development_dependency 'rubocop-performance'
+  s.add_development_dependency 'rubocop-rake'
+
+  s.metadata['rubygems_mfa_required'] = 'true'
 end
diff --git a/lib/html2text.rb b/lib/html2text.rb
index 7abc7d3..a332387 100644
--- a/lib/html2text.rb
+++ b/lib/html2text.rb
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require 'nokogiri'
 
 class Html2Text
@@ -10,7 +12,7 @@ def initialize(doc)
   def self.convert(html)
     html = html.to_s
 
-    if is_office_document?(html)
+    if office_document?(html)
       # Emulate the CSS rendering of Office documents
       html = html.gsub('

', '
') .gsub(' ', '
') @@ -58,15 +60,15 @@ def remove_leading_and_trailing_whitespace(text) end end - output.join('') + output.join end - private - - def self.is_office_document?(text) + private_class_method def self.office_document?(text) text.include?('urn:schemas-microsoft-com:office') end + private + def remove_unnecessary_empty_lines(text) text.gsub(/\n\n\n*/im, "\n\n") end @@ -99,7 +101,7 @@ def iterate_over(node) end output << suffix_whitespace(node) - output = output.compact.join('') || '' + output = output.compact.join || '' unless node.name.nil? if node.name.downcase == 'a' @@ -112,6 +114,7 @@ def iterate_over(node) output end + # rubocop:disable Lint/DuplicateBranch def prefix_whitespace(node) case node.name.downcase when 'hr' @@ -140,7 +143,9 @@ def prefix_whitespace(node) '- ' end end + # rubocop:enable Lint/DuplicateBranch + # rubocop:disable Lint/DuplicateBranch def suffix_whitespace(node) case node.name.downcase when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' @@ -164,6 +169,7 @@ def suffix_whitespace(node) end end end + # rubocop:enable Lint/DuplicateBranch # links are returned in [text](link) format def wrap_link(node, output) @@ -208,9 +214,9 @@ def wrap_link(node, output) def image_text(node) if node.attribute('title') - '[' + node.attribute('title').to_s + ']' + "[#{node.attribute('title')}]" elsif node.attribute('alt') - '[' + node.attribute('alt').to_s + ']' + "[#{node.attribute('alt')}]" else '' end @@ -224,7 +230,7 @@ def next_node_name(node) next_node = next_node.next_sibling end - return unless next_node && next_node.element? + return unless next_node&.element? next_node.name.downcase end @@ -241,7 +247,7 @@ def previous_node_name(node) previous_node = previous_node.previous_sibling end - return unless previous_node && previous_node.element? + return unless previous_node&.element? 
previous_node.name.downcase end diff --git a/lib/html2text/version.rb b/lib/html2text/version.rb index 464f127..3a6a6a4 100644 --- a/lib/html2text/version.rb +++ b/lib/html2text/version.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + class Html2Text VERSION = '0.3.1' end diff --git a/spec/examples_spec.rb b/spec/examples_spec.rb index c0e0010..0330eeb 100644 --- a/spec/examples_spec.rb +++ b/spec/examples_spec.rb @@ -1,13 +1,15 @@ +# frozen_string_literal: true + require 'spec_helper' describe Html2Text do describe '#convert' do let(:text) { Html2Text.convert(html) } - examples = Dir[File.dirname(__FILE__) + '/examples/*.html'] + examples = Dir["#{File.dirname(__FILE__)}/examples/*.html"] examples.each do |filename| - context "#{filename}" do + context filename.to_s do let(:html) { File.read(filename) } let(:text_file) { filename.sub('.html', '.txt') } let(:expected) { Html2Text.fix_newlines(File.read(text_file)) } @@ -18,11 +20,7 @@ it 'converts to text' do # Write the output if it failed, for easier comparison - unless text.eql?(expected) - File.open(filename.sub('.html', '.output'), 'w') do |fp| - fp.write(text) - end - end + File.write(filename.sub('.html', '.output'), text) unless text.eql?(expected) # Quick check, don't try to generate a 500kb+ diff, # which can halt the rspec for minutes+ diff --git a/spec/html2text_spec.rb b/spec/html2text_spec.rb index 1d28b25..516d64d 100644 --- a/spec/html2text_spec.rb +++ b/spec/html2text_spec.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + require 'spec_helper' describe Html2Text do diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index a7cdbb6..e2acb5a 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: true + require 'rspec' require 'rspec/collection_matchers' From 6ec1b697b43a4f42df7fb6ee320fdf68c585db41 Mon Sep 17 00:00:00 2001 From: Michael Scrivo Date: Fri, 7 Jun 2024 10:27:30 -0400 Subject: [PATCH 3/3] Disable cop --- lib/html2text.rb | 4 ++++ 1 file 
changed, 4 insertions(+) diff --git a/lib/html2text.rb b/lib/html2text.rb index a332387..66e19e0 100644 --- a/lib/html2text.rb +++ b/lib/html2text.rb @@ -31,11 +31,15 @@ def self.convert(html) end def self.fix_newlines(text) + # rubocop:disable Performance/StringReplacement text.gsub("\r\n", "\n").gsub("\r", "\n") + # rubocop:enable Performance/StringReplacement end def self.replace_entities(text) + # rubocop:disable Performance/StringReplacement text.gsub(' ', ' ').gsub("\u00a0", ' ').gsub('‌', '') + # rubocop:enable Performance/StringReplacement end def convert