Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parser initialization parameter :allowed_error_codes #23

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion lib/wombat/processing/parser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,11 @@ module Processing
module Parser
attr_accessor :mechanize, :context, :response_code, :page

# Creates the Mechanize instance stored in @mechanize and configures it.
#
# opts - optional Hash (defaults to {}). When opts[:allowed_error_codes] is
#        truthy, its value is assigned to the Mechanize agent's
#        allowed_error_codes.
#
# NOTE(review): the bare `def initialize` line directly below looks like the
# pre-change signature retained by the diff view (the hunk reports one
# deletion); the line after it is the new signature — confirm before copying.
def initialize
def initialize(opts={})
@mechanize = Mechanize.new
if opts[:allowed_error_codes]
# Only override the agent's setting when the caller supplied a value.
@mechanize.agent.allowed_error_codes = opts[:allowed_error_codes]
end
# Route requests through a proxy when Wombat.proxy_args is set.
@mechanize.set_proxy(*Wombat.proxy_args) if Wombat.proxy_args
end

Expand Down
9 changes: 7 additions & 2 deletions lib/wombat/property/locators/follow.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,12 @@ def locate(context, page = nil)
super do
locate_nodes(context).flat_map do |node|
target_page = page.click node
context = target_page.parser
if target_page.respond_to? :parser
context = target_page.parser
else
# Mechanize returns different page types depending on the status code; not all of them respond to #parser, so parse the raw body directly.
context = Nokogiri::HTML(target_page.body)
end

filter_properties(context, page)
end
Expand All @@ -17,4 +22,4 @@ def locate(context, page = nil)
end
end
end
end
end
78 changes: 57 additions & 21 deletions spec/integration/integration_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -245,32 +245,68 @@
end
end

it 'should follow links' do
VCR.use_cassette('follow_links') do
crawler = Class.new
crawler.send(:include, Wombat::Crawler)
context "when following links" do
it "should be successful when all links are valid" do
VCR.use_cassette('follow_links') do
crawler = Class.new
crawler.send(:include, Wombat::Crawler)

crawler.base_url "https://www.github.com"
crawler.path "/"
crawler.base_url "https://www.github.com"
crawler.path "/"

crawler.github 'xpath=//ul[@class="footer_nav"][1]//a', :follow do
heading 'css=h1'
crawler.github 'xpath=//ul[@class="footer_nav"][1]//a', :follow do
heading 'css=h1'
end

crawler_instance = crawler.new
results = crawler_instance.crawl

results.should == {
"github" => [
{ "heading"=>"GitHub helps people build software together." },
{ "heading"=>nil },
{ "heading"=>"Features" },
{ "heading"=>"Contact GitHub" },
{ "heading"=>"GitHub Training — Git Training from the Experts" },
{ "heading"=>"GitHub on Your Servers" },
{ "heading"=>"Loading..." }
]
}
end
end

crawler_instance = crawler.new
results = crawler_instance.crawl
it "should be successful when respecting allowed_error_codes" do
VCR.use_cassette('follow_links') do
crawler = Class.new
crawler.send(:include, Wombat::Crawler)

crawler.base_url "https://www.github.com"
crawler.path "/"

# This takes precedence over the VCR cassette
FakeWeb.register_uri(:get, "https://github.com/contact",
body: "<h1>This is not the web page you are looking for.</h1>",
status: ["404", "Not Found"])

results.should == {
"github" => [
{ "heading"=>"GitHub helps people build software together." },
{ "heading"=>nil },
{ "heading"=>"Features" },
{ "heading"=>"Contact GitHub" },
{ "heading"=>"GitHub Training — Git Training from the Experts" },
{ "heading"=>"GitHub on Your Servers" },
{ "heading"=>"Loading..." }
]
}
crawler.github 'xpath=//ul[@class="footer_nav"][1]//a', :follow do
heading 'css=h1'
end

crawler_instance = crawler.new(allowed_error_codes: ['404'])
results = crawler_instance.crawl

results.should == {
"github" => [
{ "heading"=>"GitHub helps people build software together." },
{ "heading"=>nil },
{ "heading"=>"Features" },
{ "heading"=>"This is not the web page you are looking for." },
{ "heading"=>"GitHub Training — Git Training from the Experts" },
{ "heading"=>"GitHub on Your Servers" },
{ "heading"=>"Loading..." }
]
}
end
end
end
end