Skip to content

Commit

Permalink
Fix problem when DNS resolution fails one or two times, and then URI …
Browse files Browse the repository at this point in the history
…is excluded by robots.txt.
  • Loading branch information
johnerikhalse committed Nov 22, 2021
1 parent 1e15023 commit cf0e0ce
Show file tree
Hide file tree
Showing 5 changed files with 94 additions and 13 deletions.
2 changes: 1 addition & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
<io.opentracing.version>0.33.0</io.opentracing.version>

<junit.jupiter.version>5.7.0</junit.jupiter.version>
<org.testcontainers.version>1.15.3</org.testcontainers.version>
<org.testcontainers.version>1.16.2</org.testcontainers.version>

<!-- Docker auth credentials -->
<!--suppress UnresolvedMavenProperty -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -423,15 +423,15 @@ private void removeQUri(JedisContext ctx, String id, String chgId, String eid, l
}
}

public boolean removeTmpCrawlHostGroup(QueuedUri qUri) {
return removeQUri(qUri, false);
public boolean removeTmpCrawlHostGroup(QueuedUri qUri, String tmpChgId, boolean deleteUri) {
return removeQUri(qUri, tmpChgId, deleteUri);
}

public boolean removeQUri(QueuedUri qUri) {
return removeQUri(qUri, true);
return removeQUri(qUri, qUri.getCrawlHostGroupId(), true);
}

private boolean removeQUri(QueuedUri qUri, boolean deleteUri) {
private boolean removeQUri(QueuedUri qUri, String chgId, boolean deleteUri) {
if (LOG.isTraceEnabled()) {
String stack = Arrays.stream(new RuntimeException().getStackTrace())
.filter(s -> s.getClassName().contains("no.nb.nna"))
Expand All @@ -443,7 +443,7 @@ private boolean removeQUri(QueuedUri qUri, boolean deleteUri) {
try (JedisContext ctx = JedisContext.forPool(jedisPool)) {
long numRemoved = uriRemoveScript.run(ctx,
qUri.getId(),
qUri.getCrawlHostGroupId(),
chgId,
qUri.getExecutionId(),
qUri.getSequence(),
qUri.getEarliestFetchTimeStamp().getSeconds(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ public boolean preFetch() throws DbException {
switch (check) {
case DENIED:
LOG.debug("DENIED");
status.removeCurrentUri(qUri).saveStatus();
status.saveStatus();
CrawlExecutionHelpers.postFetchFinally(frontier, status, qUri, 0);
return false;
case RETRY:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ public static ListenableFuture<PreconditionState> checkPreconditions(Frontier fr
default:
frontier.writeLog(frontier, qUri);
}
if (!qUri.isUnresolved()) {
status.removeCurrentUri(qUri);
}
status.incrementDocumentsOutOfScope();
frontier.getOutOfScopeHandlerClient().submitUri(qUri.getQueuedUri());
return Futures.immediateFuture(PreconditionState.DENIED);
Expand Down Expand Up @@ -124,10 +127,9 @@ public void onSuccess(InetSocketAddress result) {
ConfigObject politeness = frontier.getConfig(crawlConfig.getCrawlConfig().getPolitenessRef());
ConfigObject browserConfig = frontier.getConfig(crawlConfig.getCrawlConfig().getBrowserConfigRef());

boolean changedCrawlHostGroup = false;
String changedCrawlHostGroup = null;
if (!qUri.getCrawlHostGroupId().isEmpty() && !qUri.getQueuedUri().getId().isEmpty()) {
changedCrawlHostGroup = true;
frontier.getCrawlQueueManager().removeTmpCrawlHostGroup(qUri.getQueuedUri());
changedCrawlHostGroup = qUri.getCrawlHostGroupId();
}
qUri.setIp(result.getAddress().getHostAddress());
qUri.setResolved(politeness);
Expand Down Expand Up @@ -167,6 +169,9 @@ public void onFailure(Throwable t) {
LOG.error("Unable to update uri earliest fetch timestamp", e);
}
}
if (state == PreconditionState.DENIED && !qUri.getCrawlHostGroupId().isEmpty() && !qUri.getQueuedUri().getId().isEmpty()) {
status.removeCurrentUri(qUri);
}
future.set(state);
} catch (DbException e) {
future.setException(e);
Expand Down Expand Up @@ -203,13 +208,13 @@ public void onFailure(Throwable t) {
}

static class IsAllowedFunc implements Consumer<Boolean> {
private final boolean changedCrawlHostGroup;
private final String changedCrawlHostGroup;
private final Frontier frontier;
private final QueuedUriWrapper qUri;
private final StatusWrapper status;
private final SettableFuture<PreconditionState> future;

public IsAllowedFunc(boolean changedCrawlHostGroup, Frontier frontier, QueuedUriWrapper qUri, StatusWrapper status, SettableFuture<PreconditionState> future) {
public IsAllowedFunc(String changedCrawlHostGroup, Frontier frontier, QueuedUriWrapper qUri, StatusWrapper status, SettableFuture<PreconditionState> future) {
this.changedCrawlHostGroup = changedCrawlHostGroup;
this.frontier = frontier;
this.qUri = qUri;
Expand All @@ -221,13 +226,19 @@ public IsAllowedFunc(boolean changedCrawlHostGroup, Frontier frontier, QueuedUri
public void accept(Boolean isAllowed) {
try {
if (isAllowed) {
if (changedCrawlHostGroup) {
if (changedCrawlHostGroup != null) {
frontier.getCrawlQueueManager().removeTmpCrawlHostGroup(qUri.getQueuedUri(), changedCrawlHostGroup, false);
frontier.getCrawlQueueManager().addToCrawlHostGroup(qUri.getQueuedUri());
future.set(PreconditionState.RETRY);
} else {
future.set(PreconditionState.OK);
}
} else {
if (changedCrawlHostGroup != null) {
frontier.getCrawlQueueManager().removeTmpCrawlHostGroup(qUri.getQueuedUri(), changedCrawlHostGroup, true);
} else {
status.removeCurrentUri(qUri);
}
LOG.info("URI '{}' precluded by robots.txt", qUri.getUri());
qUri.setError(ExtraStatusCodes.PRECLUDED_BY_ROBOTS.toFetchError());
status.incrementDocumentsDenied(1L);
Expand Down
70 changes: 70 additions & 0 deletions src/test/java/no/nb/nna/veidemann/frontier/api/HarvestTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,76 @@ public void testDeniedByRobotsTxt() throws Exception {
.readyQueue().hasNumberOfElements(0);
}

/**
 * Runs a single-seed crawl in which the DNS resolver mock is configured with a fetch error for
 * the seed host and the robots evaluator mock denies the seed URL, then checks that:
 * exactly one crawl log entry with status PRECLUDED_BY_ROBOTS is written, the job and crawl
 * execution both finish with one denied document, and no queue, crawl host group, session
 * token or ready-queue entries are left in RethinkDB or Redis.
 */
@Test
public void testDnsFailureOnceThenDeniedByRobots() throws Exception {
int seedCount = 1;
int linksPerLevel = 0;
int maxHopsFromSeed = 1;

// Configure all mocks before the crawl is started further down.
scopeCheckerServiceMock.withMaxHopsFromSeed(maxHopsFromSeed);
harvesterMock.withLinksPerLevel(linksPerLevel);
// NOTE(review): the two trailing arguments (1, 1) presumably select which lookup attempts
// for this host fail — confirm against the DNS resolver mock.
dnsResolverMock.withFetchErrorForHostRequests("a.seed-000000.com", 1, 1);
// The seed URL itself is denied by the robots evaluator mock.
robotsEvaluatorMock
.withFetchDenialForUrl("http://a.seed-000000.com");

// Generate one job with one seed, start the crawl and wait for it to finish.
ConfigObject job = crawlRunner.genJob("job1");
List<SeedAndExecutions> seeds = crawlRunner.genSeeds(seedCount, "a.seed", job);
RunningCrawl crawl = crawlRunner.runCrawl(job, seeds);
crawlRunner.awaitCrawlFinished(crawl);

// Exactly one crawl log request is expected, carrying PRECLUDED_BY_ROBOTS both as
// status code and as error code for the seed URL.
assertThat(logServiceMock.crawlLogs).hasNumberOfRequests(1)
.hasRequestSatisfying(r -> {
assertThat(r)
.hasWarcId()
.statusCodeEquals(PRECLUDED_BY_ROBOTS)
.requestedUriEquals("http://a.seed-000000.com")
.error().isNotNull().codeEquals(PRECLUDED_BY_ROBOTS);
});

// No URIs may remain queued in RethinkDB.
assertThat(rethinkDbData)
.hasQueueTotalCount(0);

// Job-level status: finished, nothing crawled or failed, one document denied.
// NOTE(review): the expected retry count of 2 presumably stems from the configured DNS
// failure plus the re-queue after resolution — confirm against the frontier logic.
assertThat(rethinkDbData)
.jobExecutionStatuses().hasSize(1).hasEntrySatisfying(crawl.getStatus().getId(), j -> {
assertThat(j)
.hasState(JobExecutionStatus.State.FINISHED)
.hasStartTime(true)
.hasEndTime(true)
.documentsCrawledEquals(0)
.documentsDeniedEquals(1)
.documentsFailedEquals(0)
.documentsRetriedEquals(2)
.documentsOutOfScopeEquals(0);
});
// Crawl-execution-level status for the single seed: same counters as the job, and in
// addition no current URI id may be left on the execution.
String crawlExecutionId1 = seeds.get(0).getCrawlExecution(job).get().getId();
assertThat(rethinkDbData)
.crawlExecutionStatuses().hasSize(seedCount)
.hasEntrySatisfying(crawlExecutionId1, s -> {
assertThat(s)
.hasState(CrawlExecutionStatus.State.FINISHED)
.hasStartTime(true)
.hasEndTime(true)
.documentsCrawledEquals(0)
.documentsDeniedEquals(1)
.documentsFailedEquals(0)
.documentsRetriedEquals(2)
.documentsOutOfScopeEquals(0)
.currentUriIdCountIsEqualTo(0);
});

assertThat(rethinkDbData).jobStatsMatchesCrawlExecutions();

// Redis must hold no leftovers: no queued URIs, crawl host groups, per-execution queue
// counts, session tokens or ready-queue entries.
assertThat(redisData)
.hasQueueTotalCount(0)
.crawlHostGroups().hasNumberOfElements(0);
assertThat(redisData)
.crawlExecutionQueueCounts().hasNumberOfElements(0);
assertThat(redisData)
.sessionTokens().hasNumberOfElements(0);
assertThat(redisData)
.readyQueue().hasNumberOfElements(0);
}

@Test
public void testRecheckScope() throws Exception {
int seedCount = 1;
Expand Down

0 comments on commit cf0e0ce

Please sign in to comment.