Skip to content

Commit

Permalink
OCR (#65)
Browse files Browse the repository at this point in the history
* Add AutoDetectTextExtractor

* Add bytedeco ImageTextExtractor

* Add PDF text extractor

* Add max length validation

* Add white chars filter

* Add minimum word length validator
  • Loading branch information
yuliiabuchko authored May 14, 2020
1 parent fd34656 commit 4355720
Show file tree
Hide file tree
Showing 12 changed files with 219 additions and 5 deletions.
3 changes: 3 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ dependencies {
compile("org.glassfish.jaxb:jaxb-runtime:2.3.2")
compile("commons-io:commons-io:2.6")
compile("org.apache.tika:tika-parsers:1.24")
compile("org.apache.pdfbox:pdfbox:2.0.19")
compile("net.sourceforge.tess4j:tess4j:4.5.1")
compile("org.bytedeco:tesseract-platform:4.1.1-1.5.3")

// Remove devtools for release
runtime('org.springframework.boot:spring-boot-devtools')
Expand Down
4 changes: 2 additions & 2 deletions gradle/wrapper/gradle-wrapper.properties
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#Mon May 04 21:03:43 CEST 2020
distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.2-all.zip
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.2-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package pl.edu.uj.ii.ksi.mordor.configuration

import org.bytedeco.tesseract.TessBaseAPI
import org.springframework.context.annotation.Bean
import org.springframework.context.annotation.Configuration

/**
 * Spring configuration exposing a single Tesseract [TessBaseAPI] bean used for OCR.
 */
@Configuration
class TessBaseConfig {

    /**
     * Creates a Tesseract engine with both the English and the Polish models loaded.
     *
     * Tesseract takes several languages in one `Init` call, joined with `+`. Calling `Init`
     * once per language re-initialises the engine each time, so only the language of the
     * last call would stay active.
     *
     * @throws IllegalStateException when Tesseract reports an initialisation failure
     *         (non-zero return code), e.g. because the trained data files are missing.
     */
    @Bean
    fun tessBaseAPI(): TessBaseAPI {
        val api = TessBaseAPI()
        val status = api.Init("./src/main/resources/tessdata/", "eng+pol")
        // An uninitialised engine cannot recognise anything; fail fast instead of handing it out.
        check(status == 0) { "Could not initialize Tesseract (Init returned $status)" }
        return api
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,21 @@ package pl.edu.uj.ii.ksi.mordor.services

import java.io.File
import javax.persistence.EntityManager
import org.springframework.beans.factory.annotation.Qualifier
import org.springframework.stereotype.Service
import org.springframework.transaction.annotation.Transactional
import pl.edu.uj.ii.ksi.mordor.persistence.entities.FileContent
import pl.edu.uj.ii.ksi.mordor.persistence.entities.FileEntry
import pl.edu.uj.ii.ksi.mordor.persistence.entities.FileMetadata
import pl.edu.uj.ii.ksi.mordor.persistence.repositories.FileMetadataRepository
import pl.edu.uj.ii.ksi.mordor.services.hash.FileHashProvider
import pl.edu.uj.ii.ksi.mordor.services.text.extractor.FileTextExtractor

@Service
class FileEntryCreator(
private val metadataExtractor: MetadataExtractor,
private val entityManager: EntityManager,
private val fileTextExtractor: FileTextExtractor,
@Qualifier("autoDetectTextExtractor") private val fileTextExtractor: FileTextExtractor,
private val hashProvider: FileHashProvider,
private val metadataRepository: FileMetadataRepository
) {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package pl.edu.uj.ii.ksi.mordor.services.text.extractor

import java.io.File
import java.io.IOException
import java.lang.StringBuilder
import org.apache.tika.Tika
import org.bytedeco.tesseract.TessBaseAPI
import org.slf4j.LoggerFactory
import org.springframework.stereotype.Service
import pl.edu.uj.ii.ksi.mordor.services.repository.RepositoryService

/**
 * [FileTextExtractor] that first tries Tika and falls back to OCR when Tika yields no text.
 *
 * The raw text is stripped of short words and then checked by [FileContentValidator];
 * content rejected by the validator is reported as `null`.
 */
@Service
class AutoDetectTextExtractor(
    private val tika: Tika,
    private val tessBaseAPI: TessBaseAPI
) : FileTextExtractor {

    /** Words shorter than this many characters are removed from the extracted text. */
    private val minWordLength = 4

    /**
     * Extracts text from [file], passing [maxLength] through to the underlying extractors.
     *
     * @return the cleaned text, or `null` when nothing could be extracted or the
     *         validator rejected the result.
     */
    override fun extract(file: File, maxLength: Int): String? {
        val content = cleanSmallWords(extractRaw(file, maxLength))
        return if (FileContentValidator().isValid(content)) content else null
    }

    /**
     * Runs Tika first; if its output is blank, picks an OCR extractor based on the detected
     * media type. Returns `null` for unsupported types or when an [IOException] occurs.
     */
    private fun extractRaw(file: File, maxLength: Int): String? {
        try {
            val tikaContent = TikaFileTextExtractor(tika).extract(file, maxLength)
            if (!isScanned(tikaContent)) {
                logger.info("Extracted text from {} using Tika", file.absolutePath)
                return tikaContent
            }

            val type = tika.detect(file)
            return when {
                type == "application/pdf" -> {
                    logger.info("Extracted text from {} using Bytedeco for PDF", file.absolutePath)
                    PDFTextExtractor(tessBaseAPI).extract(file, maxLength)
                }
                type.startsWith("image") -> {
                    logger.info("Extracted text from {} using Bytedeco", file.absolutePath)
                    ImageTextExtractor(tessBaseAPI).extract(file, maxLength)
                }
                else -> null
            }
        } catch (e: IOException) {
            logger.error("File can not be read: " + file.absolutePath, e)
        }
        return null
    }

    /** A file counts as scanned (needing OCR) when Tika produced no text or only whitespace. */
    private fun isScanned(content: String?): Boolean = content.isNullOrBlank()

    /**
     * Drops every whitespace-separated token shorter than [minWordLength] and joins the
     * remaining tokens with single spaces. A `null` input stays `null`.
     */
    private fun cleanSmallWords(content: String?): String? =
        content
            ?.split(WHITESPACE)
            // Tokens of exactly minWordLength characters are kept; empty tokens fall out here too.
            ?.filter { it.length >= minWordLength }
            ?.joinToString(" ")

    companion object {
        private val logger = LoggerFactory.getLogger(AutoDetectTextExtractor::class.java)

        // Compiled once instead of on every call.
        private val WHITESPACE = Regex("\\s+")
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package pl.edu.uj.ii.ksi.mordor.services.text.extractor

import org.slf4j.LoggerFactory
import pl.edu.uj.ii.ksi.mordor.services.repository.RepositoryService

/**
 * Rejects extracted text that consists mostly of non-alphanumeric characters.
 */
class FileContentValidator {

    /** Minimum share (0.0-1.0) of letters and digits the content must have to be accepted. */
    private val minAlphanumericsPercent = 0.6

    /**
     * @return `true` when [content] is `null`, empty, or has at least
     *         [minAlphanumericsPercent] of its characters being letters or digits.
     */
    fun isValid(content: String?): Boolean {
        return content.isNullOrEmpty() || whiteSpaceFilter(content)
    }

    /**
     * Checks the alphanumeric ratio of a non-empty [content]; logs a warning and returns
     * `false` when the ratio is below the threshold.
     */
    private fun whiteSpaceFilter(content: String): Boolean {
        val alphanumerics = content.count { it.isLetterOrDigit() }
        // Computed in Double so the comparison with the Double threshold is not skewed by Float rounding.
        val ratio = alphanumerics.toDouble() / content.length
        if (ratio < minAlphanumericsPercent) {
            logger.warn("Number of alphanumeric chars is less than 60%. OCR result will be turned to null")
            return false
        }
        return true
    }

    companion object {
        private val logger = LoggerFactory.getLogger(FileContentValidator::class.java)
    }
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package pl.edu.uj.ii.ksi.mordor.services
package pl.edu.uj.ii.ksi.mordor.services.text.extractor

import java.io.File

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package pl.edu.uj.ii.ksi.mordor.services.text.extractor

import java.io.File
import org.bytedeco.leptonica.global.lept.pixRead
import org.bytedeco.tesseract.TessBaseAPI

/**
 * [FileTextExtractor] that runs Tesseract OCR on a single image file.
 */
class ImageTextExtractor(private val tessBaseAPI: TessBaseAPI) : FileTextExtractor {

    /**
     * Reads [file] as an image and returns the recognised text.
     *
     * @param maxLength maximum number of characters to return; a negative value means no limit.
     * @return the recognised text, or `null` when the image could not be read or Tesseract
     *         produced no result.
     */
    override fun extract(file: File, maxLength: Int): String? {
        // pixRead hands back a natively allocated image; null means the file could not be read.
        val image = pixRead(file.absolutePath) ?: return null
        try {
            tessBaseAPI.SetImage(image)

            val utf8Text = tessBaseAPI.GetUTF8Text() ?: return null
            val res = try {
                utf8Text.string.trimIndent()
            } finally {
                // The text buffer is owned by the caller and has to be released explicitly.
                utf8Text.deallocate()
            }
            return if (maxLength < 0) res else res.take(maxLength)
        } finally {
            // Release the native image; the garbage collector does not reclaim it.
            org.bytedeco.leptonica.global.lept.pixDestroy(image)
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
package pl.edu.uj.ii.ksi.mordor.services.text.extractor

import com.recognition.software.jdeskew.ImageDeskew
import java.awt.image.BufferedImage
import java.io.File
import java.io.IOException
import java.util.LinkedList
import javax.imageio.ImageIO
import net.sourceforge.tess4j.util.ImageHelper
import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject
import org.bytedeco.tesseract.TessBaseAPI
import org.slf4j.LoggerFactory
import org.springframework.stereotype.Service
import pl.edu.uj.ii.ksi.mordor.services.repository.RepositoryService

/**
 * [FileTextExtractor] for scanned PDFs: pulls the images embedded in the pages, deskews
 * them and runs each one through [ImageTextExtractor].
 */
@Service
class PDFTextExtractor(private val tessBaseAPI: TessBaseAPI) : FileTextExtractor {

    /** Skew angles whose absolute value exceeds this are corrected by rotating the image back. */
    private val maxSkewAngle = 0.05

    /**
     * Runs OCR over the images embedded in [file].
     *
     * @param maxLength maximum number of characters to return; a negative value means no limit.
     * @return the recognised text of all images (one image per line block), or `null` when
     *         the PDF contains no images.
     */
    override fun extract(file: File, maxLength: Int): String? {
        val images = formatPDF(file)
        if (images.isEmpty()) {
            return null
        }

        val imageExtractor = ImageTextExtractor(tessBaseAPI)
        val extracted = StringBuilder()
        val outputFile = File.createTempFile("temp", ".jpg")
        try {
            for (image in images) {
                // Stop as soon as the cap is reached; further OCR would be thrown away.
                if (maxLength >= 0 && extracted.length >= maxLength) {
                    break
                }

                try {
                    ImageIO.write(correctTwisted(image), "jpg", outputFile)
                    val text = imageExtractor.extract(outputFile, maxLength)
                    if (!text.isNullOrEmpty()) {
                        // Separate images so the last word of one is not glued to the first of the next.
                        if (extracted.isNotEmpty()) {
                            extracted.append('\n')
                        }
                        extracted.append(text)
                    }
                } catch (e: IOException) {
                    logger.error("Could not retrieve text from " + file.absolutePath, e)
                }
            }
        } finally {
            // Remove the scratch file even when OCR fails with an unexpected exception.
            outputFile.delete()
        }

        return if (maxLength >= 0 && maxLength < extracted.length) {
            extracted.substring(0, maxLength)
        } else {
            extracted.toString()
        }
    }

    /**
     * Collects the image XObjects referenced directly by the resources of each page.
     * Pages without a resource dictionary are skipped.
     */
    private fun formatPDF(pdfFile: File): List<BufferedImage> {
        val bufferedImages = mutableListOf<BufferedImage>()
        PDDocument.load(pdfFile).use { doc ->
            for (page in doc.pages) {
                val resources = page.resources ?: continue
                for (xObjectName in resources.xObjectNames) {
                    val xObject = resources.getXObject(xObjectName)
                    if (xObject is PDImageXObject) {
                        val image: BufferedImage? = xObject.image
                        if (image != null) {
                            bufferedImages.add(image)
                        }
                    }
                }
            }
        }
        return bufferedImages
    }

    /**
     * Converts [image] to grayscale and, when its skew angle exceeds [maxSkewAngle],
     * rotates it by the opposite angle.
     */
    private fun correctTwisted(image: BufferedImage): BufferedImage {
        val imageSkewAngle = ImageDeskew(image).skewAngle
        val grayscale = ImageHelper.convertImageToGrayscale(image)
        return if (kotlin.math.abs(imageSkewAngle) > maxSkewAngle) {
            ImageHelper.rotateImage(grayscale, -imageSkewAngle)
        } else {
            grayscale
        }
    }

    companion object {
        private val logger = LoggerFactory.getLogger(PDFTextExtractor::class.java)
    }
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package pl.edu.uj.ii.ksi.mordor.services
package pl.edu.uj.ii.ksi.mordor.services.text.extractor

import java.io.File
import java.io.IOException
Expand Down
Binary file added src/main/resources/tessdata/eng.traineddata
Binary file not shown.
Binary file added src/main/resources/tessdata/pol.traineddata
Binary file not shown.

0 comments on commit 4355720

Please sign in to comment.