diff --git a/project.clj b/project.clj
index 7d9ef8e..f2ab364 100644
--- a/project.clj
+++ b/project.clj
@@ -1,4 +1,4 @@
-(defproject run.avelino/boilerpipe-clj "0.3.1"
+(defproject run.avelino/boilerpipe-clj "0.3.2"
   :description "A simple wrapper around the Boilerpipe library for extracting text from html articles/pages"
   :url "https://avelino.run"
   :license {:name "Apache License, Version 2.0"
diff --git a/src/boilerpipe_clj/core.clj b/src/boilerpipe_clj/core.clj
index 2b88d91..dfbd4d0 100644
--- a/src/boilerpipe_clj/core.clj
+++ b/src/boilerpipe_clj/core.clj
@@ -1,5 +1,6 @@
 (ns boilerpipe-clj.core
-  (:require [boilerpipe-clj.extractors :as ext])
+  (:require [boilerpipe-clj.extractors :as ext]
+            [clojure.java.io :refer [as-url]])
   (:import (de.l3s.boilerpipe.extractors ExtractorBase)))
 
 (defn get-text
@@ -11,3 +12,12 @@
    (get-text source ext/article-extractor))
   ([^String source ^ExtractorBase extractor]
    (.getText extractor source)))
+
+(defn get-images
+  "Takes the URL of a page and returns the list of Image objects produced by
+  running ext/image-extractor over it. Uses ext/default-extractor unless
+  another extractor is supplied, mirroring the two arities of get-text."
+  ([^String url]
+   (get-images url ext/default-extractor))
+  ([^String url ^ExtractorBase extractor]
+   (.process ext/image-extractor (as-url url) extractor)))
diff --git a/src/boilerpipe_clj/extractors.clj b/src/boilerpipe_clj/extractors.clj
index 94be0d4..fb416e7 100644
--- a/src/boilerpipe_clj/extractors.clj
+++ b/src/boilerpipe_clj/extractors.clj
@@ -2,9 +2,11 @@
   (:import (de.l3s.boilerpipe.extractors
             ArticleExtractor
             ArticleSentencesExtractor
-            DefaultExtractor)))
+            DefaultExtractor)
+           (de.l3s.boilerpipe.sax ImageExtractor)))
 
 
 (defonce article-extractor (ArticleExtractor/getInstance))
 (defonce default-extractor (DefaultExtractor/getInstance))
+(defonce image-extractor ImageExtractor/INSTANCE)
 (defonce article-sentence-extractor (ArticleSentencesExtractor/getInstance))