From 4edbbb77642b7a86ecca7184ce8dda5f22775c38 Mon Sep 17 00:00:00 2001 From: Richard Zowalla Date: Tue, 22 Oct 2024 10:01:06 +0200 Subject: [PATCH] Release 5.1.0 --- README.md | 37 +++++++++---------- crawler4j-archetype/pom.xml | 2 +- crawler4j-boms/crawler4j-with-hsqldb/pom.xml | 2 +- .../crawler4j-with-sleepycat/pom.xml | 2 +- .../crawler4j-with-urlfrontier/pom.xml | 2 +- crawler4j-boms/pom.xml | 2 +- crawler4j-commons/pom.xml | 2 +- crawler4j-core/pom.xml | 2 +- .../crawler4j-examples-base/pom.xml | 2 +- .../crawler4j-examples-postgres/pom.xml | 2 +- crawler4j-examples/pom.xml | 2 +- .../crawler4j-frontier-hsqldb/pom.xml | 2 +- .../crawler4j-frontier-sleepycat/pom.xml | 2 +- .../crawler4j-frontier-urlfrontier/pom.xml | 2 +- crawler4j-frontier/pom.xml | 2 +- pom.xml | 23 +++++------- 16 files changed, 41 insertions(+), 47 deletions(-) diff --git a/README.md b/README.md index 2cf9d435..8684a5fc 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ This repository contains a fork of [yasserg/crawler4j](https://github.com/yasser --- crawler4j is an open source web crawler for Java which provides a simple interface for -crawling the Web. Using it, you can setup a multi-threaded web crawler in few minutes. +crawling the Web. Using it, you can set up a multithreaded web crawler in a few minutes. ## Table of content @@ -25,21 +25,18 @@ crawling the Web. Using it, you can setup a multi-threaded web crawler in few mi ## Why you should use this fork? -This fork starts where the development of the previous main repository stalled. 
+This fork picks up where development on the original main repository left off, bringing several key improvements: -Some highlights include: - -- choice between multiple frontier implementations => avoid using a database with a license that doesn't comply with your use-case -- easy substitution of various parser implementations (not only for html, but also css, binary, and plain text) -- dynamic authentication -- improved exception handling, more versatile to customize -- fixes various parsing issues -- more documentation -- more tests and all tests are JUnit5 based (so no knowledge of Groovy and/or Spock needed anymore to maintain the codebase) -- uses Apache Maven as build tool -- provides a clean upgrade path by keeping backward compatibility in mind and deprecating methods before removing them -- more eyes have gone through the code, so readability and correctness have improved -- maintained, i.e. dependencies are often updated to their latest versions +- Offers a choice between multiple frontier implementations, allowing you to avoid databases with incompatible licenses for your use case. +- Simplifies swapping out parser implementations, supporting not just HTML but also CSS, binary, and plain text formats. +- Supports dynamic authentication. +- Enhances exception handling, making it easier to customize. +- Fixes various parsing issues. +- Includes expanded documentation. +- Features additional tests, now entirely based on JUnit 5 (eliminating the need for Groovy or Spock knowledge to maintain the codebase). +- Utilizes Apache Maven as the build tool. +- Ensures a smooth upgrade path with backward compatibility and method deprecation before removal. +- Improves code readability and correctness, with more contributors reviewing the code. 
## Installation @@ -51,7 +48,7 @@ Add the following dependency to your pom.xml: de.hs-heilbronn.mi crawler4j-with-sleepycat - 5.0.2 + 5.1.0 ``` @@ -63,7 +60,7 @@ Otherwise, you can use `HSQLDB` instead de.hs-heilbronn.mi crawler4j-with-hsqldb - 5.0.2 + 5.1.0 ``` @@ -73,7 +70,7 @@ or you use an external [crawler-commons/url-frontier](https://github.com/crawler de.hs-heilbronn.mi crawler4j-with-urlfrontier - 5.0.2 + 5.1.0 ``` @@ -81,10 +78,10 @@ or you use an external [crawler-commons/url-frontier](https://github.com/crawler ### Archetype -Since `5.0.1`, we provide a Maven archetype to bootstrap crawler4j development. Just urn +We provide a Maven archetype to bootstrap crawler4j development. Just run ```bash -mvn archetype:generate -DarchetypeGroupId=de.hs-heilbronn.mi -DarchetypeArtifactId=crawler4j-archetype -DarchetypeVersion=5.0.1 +mvn archetype:generate -DarchetypeGroupId=de.hs-heilbronn.mi -DarchetypeArtifactId=crawler4j-archetype -DarchetypeVersion=5.1.0 ``` ### Manual diff --git a/crawler4j-archetype/pom.xml b/crawler4j-archetype/pom.xml index 24c8db4a..9dfab82a 100644 --- a/crawler4j-archetype/pom.xml +++ b/crawler4j-archetype/pom.xml @@ -5,7 +5,7 @@ crawler4j-parent de.hs-heilbronn.mi - 5.1.0-SNAPSHOT + 5.1.0 4.0.0 diff --git a/crawler4j-boms/crawler4j-with-hsqldb/pom.xml b/crawler4j-boms/crawler4j-with-hsqldb/pom.xml index 39a19c77..ad21a6f4 100644 --- a/crawler4j-boms/crawler4j-with-hsqldb/pom.xml +++ b/crawler4j-boms/crawler4j-with-hsqldb/pom.xml @@ -5,7 +5,7 @@ crawler4j-boms de.hs-heilbronn.mi - 5.1.0-SNAPSHOT + 5.1.0 4.0.0 diff --git a/crawler4j-boms/crawler4j-with-sleepycat/pom.xml b/crawler4j-boms/crawler4j-with-sleepycat/pom.xml index 300640df..7ad1807e 100644 --- a/crawler4j-boms/crawler4j-with-sleepycat/pom.xml +++ b/crawler4j-boms/crawler4j-with-sleepycat/pom.xml @@ -5,7 +5,7 @@ crawler4j-boms de.hs-heilbronn.mi - 5.1.0-SNAPSHOT + 5.1.0 4.0.0 pom diff --git a/crawler4j-boms/crawler4j-with-urlfrontier/pom.xml 
b/crawler4j-boms/crawler4j-with-urlfrontier/pom.xml index 630aeefe..6be20921 100644 --- a/crawler4j-boms/crawler4j-with-urlfrontier/pom.xml +++ b/crawler4j-boms/crawler4j-with-urlfrontier/pom.xml @@ -5,7 +5,7 @@ crawler4j-boms de.hs-heilbronn.mi - 5.1.0-SNAPSHOT + 5.1.0 4.0.0 pom diff --git a/crawler4j-boms/pom.xml b/crawler4j-boms/pom.xml index 29e627a1..aae583e2 100644 --- a/crawler4j-boms/pom.xml +++ b/crawler4j-boms/pom.xml @@ -5,7 +5,7 @@ crawler4j-parent de.hs-heilbronn.mi - 5.1.0-SNAPSHOT + 5.1.0 4.0.0 diff --git a/crawler4j-commons/pom.xml b/crawler4j-commons/pom.xml index a411efde..d5e55729 100644 --- a/crawler4j-commons/pom.xml +++ b/crawler4j-commons/pom.xml @@ -5,7 +5,7 @@ crawler4j-parent de.hs-heilbronn.mi - 5.1.0-SNAPSHOT + 5.1.0 4.0.0 diff --git a/crawler4j-core/pom.xml b/crawler4j-core/pom.xml index 73d7feff..c547bf55 100644 --- a/crawler4j-core/pom.xml +++ b/crawler4j-core/pom.xml @@ -5,7 +5,7 @@ crawler4j-parent de.hs-heilbronn.mi - 5.1.0-SNAPSHOT + 5.1.0 crawler4j-core diff --git a/crawler4j-examples/crawler4j-examples-base/pom.xml b/crawler4j-examples/crawler4j-examples-base/pom.xml index d09cc0d8..c882b034 100644 --- a/crawler4j-examples/crawler4j-examples-base/pom.xml +++ b/crawler4j-examples/crawler4j-examples-base/pom.xml @@ -3,7 +3,7 @@ crawler4j-examples de.hs-heilbronn.mi - 5.1.0-SNAPSHOT + 5.1.0 crawler4j-examples-base ${project.groupId}:${project.artifactId} diff --git a/crawler4j-examples/crawler4j-examples-postgres/pom.xml b/crawler4j-examples/crawler4j-examples-postgres/pom.xml index bc2041ce..f9cfaa7d 100644 --- a/crawler4j-examples/crawler4j-examples-postgres/pom.xml +++ b/crawler4j-examples/crawler4j-examples-postgres/pom.xml @@ -3,7 +3,7 @@ crawler4j-examples de.hs-heilbronn.mi - 5.1.0-SNAPSHOT + 5.1.0 ${project.groupId}:${project.artifactId} crawler4j-examples-postgres diff --git a/crawler4j-examples/pom.xml b/crawler4j-examples/pom.xml index 14f88807..eba5e5fa 100644 --- a/crawler4j-examples/pom.xml +++ 
b/crawler4j-examples/pom.xml @@ -3,7 +3,7 @@ crawler4j-parent de.hs-heilbronn.mi - 5.1.0-SNAPSHOT + 5.1.0 ${project.groupId}:${project.artifactId} diff --git a/crawler4j-frontier/crawler4j-frontier-hsqldb/pom.xml b/crawler4j-frontier/crawler4j-frontier-hsqldb/pom.xml index 9057e660..8b09bee6 100644 --- a/crawler4j-frontier/crawler4j-frontier-hsqldb/pom.xml +++ b/crawler4j-frontier/crawler4j-frontier-hsqldb/pom.xml @@ -5,7 +5,7 @@ crawler4j-frontier de.hs-heilbronn.mi - 5.1.0-SNAPSHOT + 5.1.0 4.0.0 diff --git a/crawler4j-frontier/crawler4j-frontier-sleepycat/pom.xml b/crawler4j-frontier/crawler4j-frontier-sleepycat/pom.xml index 7549f344..a3ba88da 100644 --- a/crawler4j-frontier/crawler4j-frontier-sleepycat/pom.xml +++ b/crawler4j-frontier/crawler4j-frontier-sleepycat/pom.xml @@ -5,7 +5,7 @@ crawler4j-frontier de.hs-heilbronn.mi - 5.1.0-SNAPSHOT + 5.1.0 4.0.0 diff --git a/crawler4j-frontier/crawler4j-frontier-urlfrontier/pom.xml b/crawler4j-frontier/crawler4j-frontier-urlfrontier/pom.xml index 2f2e7423..b9443245 100644 --- a/crawler4j-frontier/crawler4j-frontier-urlfrontier/pom.xml +++ b/crawler4j-frontier/crawler4j-frontier-urlfrontier/pom.xml @@ -5,7 +5,7 @@ crawler4j-frontier de.hs-heilbronn.mi - 5.1.0-SNAPSHOT + 5.1.0 4.0.0 diff --git a/crawler4j-frontier/pom.xml b/crawler4j-frontier/pom.xml index 35339f75..5a134668 100644 --- a/crawler4j-frontier/pom.xml +++ b/crawler4j-frontier/pom.xml @@ -5,7 +5,7 @@ crawler4j-parent de.hs-heilbronn.mi - 5.1.0-SNAPSHOT + 5.1.0 4.0.0 diff --git a/pom.xml b/pom.xml index 7347cc31..5a24eaaf 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ de.hs-heilbronn.mi crawler4j-parent pom - 5.1.0-SNAPSHOT + 5.1.0 ${project.groupId}:${project.artifactId} Open Source Web Crawler for Java @@ -23,18 +23,6 @@ scm:git:git@github.com:rzo1/crawler4j.git HEAD - - - ossrh - Sonatype Nexus snapshot repository - https://oss.sonatype.org/content/repositories/snapshots - - - ossrh - Sonatype Nexus release repository - 
https://oss.sonatype.org/service/local/staging/deploy/maven2/ - - yasserg @@ -168,6 +156,15 @@ org.apache.maven.plugins maven-gpg-plugin + + org.sonatype.central + central-publishing-maven-plugin + 0.6.0 + true + + central + +