From 1e15023ecfa9fcf349236cc7265ea599e22f0d20 Mon Sep 17 00:00:00 2001
From: johnerikhalse
Date: Mon, 22 Nov 2021 14:54:44 +0100
Subject: [PATCH 1/2] Try to reduce the probability of tests failing on fast machines.

---
 .../java/no/nb/nna/veidemann/frontier/api/HarvestTest.java | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/test/java/no/nb/nna/veidemann/frontier/api/HarvestTest.java b/src/test/java/no/nb/nna/veidemann/frontier/api/HarvestTest.java
index f9b7971..39dadba 100644
--- a/src/test/java/no/nb/nna/veidemann/frontier/api/HarvestTest.java
+++ b/src/test/java/no/nb/nna/veidemann/frontier/api/HarvestTest.java
@@ -617,7 +617,7 @@ public void testAbortCrawlExecutionLate() throws Exception {
 
         // Abort the second execution as soon as it is sleeping
-        await().pollDelay(10, TimeUnit.MILLISECONDS).pollInterval(10, TimeUnit.MILLISECONDS).atMost(30, TimeUnit.SECONDS)
+        await().pollDelay(5, TimeUnit.MILLISECONDS).pollInterval(5, TimeUnit.MILLISECONDS).atMost(30, TimeUnit.SECONDS)
                 .until(() -> {
                     CrawlExecutionStatus ces = DbService.getInstance().getExecutionsAdapter().getCrawlExecutionStatus(crawlExecutionId2);
                     if (ces.getState() == CrawlExecutionStatus.State.SLEEPING) {
@@ -629,7 +629,7 @@ public void testAbortCrawlExecutionLate() throws Exception {
 
         // Abort the third execution as soon as it is finished
-        await().pollDelay(10, TimeUnit.MILLISECONDS).pollInterval(10, TimeUnit.MILLISECONDS).atMost(30, TimeUnit.SECONDS)
+        await().pollDelay(5, TimeUnit.MILLISECONDS).pollInterval(5, TimeUnit.MILLISECONDS).atMost(30, TimeUnit.SECONDS)
                 .until(() -> {
                     CrawlExecutionStatus ces = DbService.getInstance().getExecutionsAdapter().getCrawlExecutionStatus(crawlExecutionId3);
                     if (ces.getState() == CrawlExecutionStatus.State.FINISHED) {

From cf0e0ce32967411984c906112cf541938b343315 Mon Sep 17 00:00:00 2001
From: johnerikhalse
Date: Mon, 22 Nov 2021 14:58:59 +0100
Subject: [PATCH 2/2] Fix problem when DNS resolution fails once or twice and the URI is then excluded by robots.txt.
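
When DNS resolution for a URI fails, the URI is queued for retry under a
temporary crawl host group. Previously that temporary entry was removed as
soon as a later DNS lookup succeeded, before robots.txt had been evaluated;
if the URI then turned out to be precluded by robots.txt, the queue and the
execution's current-uri bookkeeping could be left inconsistent.
removeTmpCrawlHostGroup() now takes the temporary group id and a deleteUri
flag so the cleanup can be deferred until the robots.txt decision is known,
and the DENIED paths remove the current uri explicitly. A regression test
(testDnsFailureOnceThenDeniedByRobots) is added and a dependency version in
pom.xml is bumped from 1.15.3 to 1.16.2.

The snippet below is only an illustrative sketch of the intended decision
flow after the robots.txt check. It uses stand-in names and types
(RobotsDecisionSketch, QueueOps, Decision, tmpChgId) rather than the real
frontier classes; the authoritative change is in the Preconditions.java and
CrawlQueueManager.java hunks below.

    /** Illustrative sketch only; stand-in types, not the real frontier classes. */
    final class RobotsDecisionSketch {

        /** Stand-in for the queue/status operations the real code calls. */
        interface QueueOps {
            void removeTmpCrawlHostGroup(String uriId, String tmpChgId, boolean deleteUri);
            void addToCrawlHostGroup(String uriId);
            void removeCurrentUri(String uriId);
        }

        enum Decision { OK, RETRY, DENIED }

        /** tmpChgId is non-null only when the URI was parked in a temporary group after a DNS retry. */
        static Decision afterRobotsCheck(boolean isAllowed, String uriId, String tmpChgId, QueueOps ops) {
            if (isAllowed) {
                if (tmpChgId != null) {
                    // Move the URI from the temporary group back to its real crawl host
                    // group (keep the URI itself) and let the fetch be retried.
                    ops.removeTmpCrawlHostGroup(uriId, tmpChgId, false);
                    ops.addToCrawlHostGroup(uriId);
                    return Decision.RETRY;
                }
                return Decision.OK;
            }
            // Precluded by robots.txt: drop the URI from whichever queue state it is in.
            if (tmpChgId != null) {
                ops.removeTmpCrawlHostGroup(uriId, tmpChgId, true);
            } else {
                ops.removeCurrentUri(uriId);
            }
            return Decision.DENIED;
        }
    }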
---
 pom.xml                                      |  2 +-
 .../frontier/db/CrawlQueueManager.java       | 10 +--
 .../frontier/worker/PreFetchHandler.java     |  2 +-
 .../frontier/worker/Preconditions.java       | 23 ++++--
 .../veidemann/frontier/api/HarvestTest.java  | 70 +++++++++++++++++++
 5 files changed, 94 insertions(+), 13 deletions(-)

diff --git a/pom.xml b/pom.xml
index a363034..1efe127 100644
--- a/pom.xml
+++ b/pom.xml
@@ -36,7 +36,7 @@
         0.33.0
         5.7.0
-        1.15.3
+        1.16.2
diff --git a/src/main/java/no/nb/nna/veidemann/frontier/db/CrawlQueueManager.java b/src/main/java/no/nb/nna/veidemann/frontier/db/CrawlQueueManager.java
index 629aa20..b7640aa 100644
--- a/src/main/java/no/nb/nna/veidemann/frontier/db/CrawlQueueManager.java
+++ b/src/main/java/no/nb/nna/veidemann/frontier/db/CrawlQueueManager.java
@@ -423,15 +423,15 @@ private void removeQUri(JedisContext ctx, String id, String chgId, String eid, l
         }
     }
 
-    public boolean removeTmpCrawlHostGroup(QueuedUri qUri) {
-        return removeQUri(qUri, false);
+    public boolean removeTmpCrawlHostGroup(QueuedUri qUri, String tmpChgId, boolean deleteUri) {
+        return removeQUri(qUri, tmpChgId, deleteUri);
     }
 
     public boolean removeQUri(QueuedUri qUri) {
-        return removeQUri(qUri, true);
+        return removeQUri(qUri, qUri.getCrawlHostGroupId(), true);
     }
 
-    private boolean removeQUri(QueuedUri qUri, boolean deleteUri) {
+    private boolean removeQUri(QueuedUri qUri, String chgId, boolean deleteUri) {
         if (LOG.isTraceEnabled()) {
             String stack = Arrays.stream(new RuntimeException().getStackTrace())
                     .filter(s -> s.getClassName().contains("no.nb.nna"))
@@ -443,7 +443,7 @@ private boolean removeQUri(QueuedUri qUri, boolean deleteUri) {
         try (JedisContext ctx = JedisContext.forPool(jedisPool)) {
             long numRemoved = uriRemoveScript.run(ctx,
                     qUri.getId(),
-                    qUri.getCrawlHostGroupId(),
+                    chgId,
                     qUri.getExecutionId(),
                     qUri.getSequence(),
                     qUri.getEarliestFetchTimeStamp().getSeconds(),
diff --git a/src/main/java/no/nb/nna/veidemann/frontier/worker/PreFetchHandler.java b/src/main/java/no/nb/nna/veidemann/frontier/worker/PreFetchHandler.java
index ed783eb..a7cc63c 100644
--- a/src/main/java/no/nb/nna/veidemann/frontier/worker/PreFetchHandler.java
+++ b/src/main/java/no/nb/nna/veidemann/frontier/worker/PreFetchHandler.java
@@ -104,7 +104,7 @@ public boolean preFetch() throws DbException {
             switch (check) {
                 case DENIED:
                     LOG.debug("DENIED");
-                    status.removeCurrentUri(qUri).saveStatus();
+                    status.saveStatus();
                     CrawlExecutionHelpers.postFetchFinally(frontier, status, qUri, 0);
                     return false;
                 case RETRY:
diff --git a/src/main/java/no/nb/nna/veidemann/frontier/worker/Preconditions.java b/src/main/java/no/nb/nna/veidemann/frontier/worker/Preconditions.java
index f338aec..a26d704 100644
--- a/src/main/java/no/nb/nna/veidemann/frontier/worker/Preconditions.java
+++ b/src/main/java/no/nb/nna/veidemann/frontier/worker/Preconditions.java
@@ -75,6 +75,9 @@ public static ListenableFuture checkPreconditions(Frontier fr
                 default:
                     frontier.writeLog(frontier, qUri);
             }
+            if (!qUri.isUnresolved()) {
+                status.removeCurrentUri(qUri);
+            }
             status.incrementDocumentsOutOfScope();
             frontier.getOutOfScopeHandlerClient().submitUri(qUri.getQueuedUri());
             return Futures.immediateFuture(PreconditionState.DENIED);
@@ -124,10 +127,9 @@ public void onSuccess(InetSocketAddress result) {
                     ConfigObject politeness = frontier.getConfig(crawlConfig.getCrawlConfig().getPolitenessRef());
                     ConfigObject browserConfig = frontier.getConfig(crawlConfig.getCrawlConfig().getBrowserConfigRef());
 
-                    boolean changedCrawlHostGroup = false;
+                    String changedCrawlHostGroup = null;
                     if (!qUri.getCrawlHostGroupId().isEmpty() && !qUri.getQueuedUri().getId().isEmpty()) {
-                        changedCrawlHostGroup = true;
-                        frontier.getCrawlQueueManager().removeTmpCrawlHostGroup(qUri.getQueuedUri());
+                        changedCrawlHostGroup = qUri.getCrawlHostGroupId();
                     }
                     qUri.setIp(result.getAddress().getHostAddress());
                     qUri.setResolved(politeness);
@@ -167,6 +169,9 @@ public void onFailure(Throwable t) {
                         LOG.error("Unable to update uri earliest fetch timestamp", e);
                     }
                 }
+                if (state == PreconditionState.DENIED && !qUri.getCrawlHostGroupId().isEmpty() && !qUri.getQueuedUri().getId().isEmpty()) {
+                    status.removeCurrentUri(qUri);
+                }
                 future.set(state);
             } catch (DbException e) {
                 future.setException(e);
@@ -203,13 +208,13 @@ public void onFailure(Throwable t) {
     }
 
     static class IsAllowedFunc implements Consumer<Boolean> {
-        private final boolean changedCrawlHostGroup;
+        private final String changedCrawlHostGroup;
         private final Frontier frontier;
         private final QueuedUriWrapper qUri;
         private final StatusWrapper status;
         private final SettableFuture<PreconditionState> future;
 
-        public IsAllowedFunc(boolean changedCrawlHostGroup, Frontier frontier, QueuedUriWrapper qUri, StatusWrapper status, SettableFuture<PreconditionState> future) {
+        public IsAllowedFunc(String changedCrawlHostGroup, Frontier frontier, QueuedUriWrapper qUri, StatusWrapper status, SettableFuture<PreconditionState> future) {
             this.changedCrawlHostGroup = changedCrawlHostGroup;
             this.frontier = frontier;
             this.qUri = qUri;
@@ -221,13 +226,19 @@ public IsAllowedFunc(boolean changedCrawlHostGroup, Frontier frontier, QueuedUri
         public void accept(Boolean isAllowed) {
             try {
                 if (isAllowed) {
-                    if (changedCrawlHostGroup) {
+                    if (changedCrawlHostGroup != null) {
+                        frontier.getCrawlQueueManager().removeTmpCrawlHostGroup(qUri.getQueuedUri(), changedCrawlHostGroup, false);
                         frontier.getCrawlQueueManager().addToCrawlHostGroup(qUri.getQueuedUri());
                         future.set(PreconditionState.RETRY);
                     } else {
                         future.set(PreconditionState.OK);
                     }
                 } else {
+                    if (changedCrawlHostGroup != null) {
+                        frontier.getCrawlQueueManager().removeTmpCrawlHostGroup(qUri.getQueuedUri(), changedCrawlHostGroup, true);
+                    } else {
+                        status.removeCurrentUri(qUri);
+                    }
                     LOG.info("URI '{}' precluded by robots.txt", qUri.getUri());
                     qUri.setError(ExtraStatusCodes.PRECLUDED_BY_ROBOTS.toFetchError());
                     status.incrementDocumentsDenied(1L);
diff --git a/src/test/java/no/nb/nna/veidemann/frontier/api/HarvestTest.java b/src/test/java/no/nb/nna/veidemann/frontier/api/HarvestTest.java
index 39dadba..6007c1b 100644
--- a/src/test/java/no/nb/nna/veidemann/frontier/api/HarvestTest.java
+++ b/src/test/java/no/nb/nna/veidemann/frontier/api/HarvestTest.java
@@ -454,6 +454,76 @@ public void testDeniedByRobotsTxt() throws Exception {
                 .readyQueue().hasNumberOfElements(0);
     }
 
+    @Test
+    public void testDnsFailureOnceThenDeniedByRobots() throws Exception {
+        int seedCount = 1;
+        int linksPerLevel = 0;
+        int maxHopsFromSeed = 1;
+
+        scopeCheckerServiceMock.withMaxHopsFromSeed(maxHopsFromSeed);
+        harvesterMock.withLinksPerLevel(linksPerLevel);
+        dnsResolverMock.withFetchErrorForHostRequests("a.seed-000000.com", 1, 1);
+        robotsEvaluatorMock
+                .withFetchDenialForUrl("http://a.seed-000000.com");
+
+        ConfigObject job = crawlRunner.genJob("job1");
+        List seeds = crawlRunner.genSeeds(seedCount, "a.seed", job);
+        RunningCrawl crawl = crawlRunner.runCrawl(job, seeds);
+        crawlRunner.awaitCrawlFinished(crawl);
+
+        assertThat(logServiceMock.crawlLogs).hasNumberOfRequests(1)
+                .hasRequestSatisfying(r -> {
+                    assertThat(r)
+                            .hasWarcId()
+                            .statusCodeEquals(PRECLUDED_BY_ROBOTS)
.requestedUriEquals("http://a.seed-000000.com") + .error().isNotNull().codeEquals(PRECLUDED_BY_ROBOTS); + }); + + assertThat(rethinkDbData) + .hasQueueTotalCount(0); + + assertThat(rethinkDbData) + .jobExecutionStatuses().hasSize(1).hasEntrySatisfying(crawl.getStatus().getId(), j -> { + assertThat(j) + .hasState(JobExecutionStatus.State.FINISHED) + .hasStartTime(true) + .hasEndTime(true) + .documentsCrawledEquals(0) + .documentsDeniedEquals(1) + .documentsFailedEquals(0) + .documentsRetriedEquals(2) + .documentsOutOfScopeEquals(0); + }); + String crawlExecutionId1 = seeds.get(0).getCrawlExecution(job).get().getId(); + assertThat(rethinkDbData) + .crawlExecutionStatuses().hasSize(seedCount) + .hasEntrySatisfying(crawlExecutionId1, s -> { + assertThat(s) + .hasState(CrawlExecutionStatus.State.FINISHED) + .hasStartTime(true) + .hasEndTime(true) + .documentsCrawledEquals(0) + .documentsDeniedEquals(1) + .documentsFailedEquals(0) + .documentsRetriedEquals(2) + .documentsOutOfScopeEquals(0) + .currentUriIdCountIsEqualTo(0); + }); + + assertThat(rethinkDbData).jobStatsMatchesCrawlExecutions(); + + assertThat(redisData) + .hasQueueTotalCount(0) + .crawlHostGroups().hasNumberOfElements(0); + assertThat(redisData) + .crawlExecutionQueueCounts().hasNumberOfElements(0); + assertThat(redisData) + .sessionTokens().hasNumberOfElements(0); + assertThat(redisData) + .readyQueue().hasNumberOfElements(0); + } + @Test public void testRecheckScope() throws Exception { int seedCount = 1;