Skip to content

Commit

Permalink
Merge pull request #2134 from openturing/feature/ocr-metadata
Browse files Browse the repository at this point in the history
Feature/ocr metadata
  • Loading branch information
alegauss authored Sep 17, 2024
2 parents c5c4ce7 + 608d383 commit 212ed8a
Show file tree
Hide file tree
Showing 20 changed files with 514 additions and 113 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.util.List;
import java.util.Objects;
import java.util.Optional;

@Slf4j
Expand Down Expand Up @@ -98,8 +99,7 @@ public RedactionScript validateText(@RequestParam("text") String text)
@PostMapping(value = "/entity/file/blazon")
public RedactionScript validateFile(@RequestParam("file") MultipartFile multipartFile)
throws IOException, InterruptedException {
final String text = TurFileUtils.documentToText(multipartFile);
return getEntities(text);
return getEntities(Objects.requireNonNull(TurFileUtils.documentToText(multipartFile)).getContent());

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@

import java.util.Arrays;
import java.util.List;
import java.util.Objects;

@Slf4j
@RestController
Expand Down Expand Up @@ -104,7 +105,7 @@ public TurData turDataAdd(@RequestBody TurData turData) {
@Transactional
public String turDataImport(@RequestParam("file") MultipartFile multipartFile) {
String[] sentences = turOpenNLPConnector.sentenceDetect(turNLPProcess.getDefaultNLPInstance(),
TurFileUtils.documentToText(multipartFile));
Objects.requireNonNull(TurFileUtils.documentToText(multipartFile)).getContent());
TurData turData = new TurData();
turData.setName(multipartFile.getOriginalFilename());
turData.setType(FilenameUtils.getExtension(multipartFile.getOriginalFilename()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,8 @@
import com.viglet.turing.api.nlp.bean.TurNLPEntityValidateResponse;
import com.viglet.turing.api.nlp.bean.TurNLPValidateDocument;
import com.viglet.turing.api.nlp.bean.TurNLPValidateResponse;
import com.viglet.turing.commons.utils.TurCommonsUtils;
import com.viglet.turing.filesystem.commons.TurFileAttributes;
import com.viglet.turing.filesystem.commons.TurFileUtils;
import com.viglet.turing.filesystem.commons.TurTikaFileAttributes;
import com.viglet.turing.nlp.TurNLPProcess;
import com.viglet.turing.nlp.TurNLPResponse;
import com.viglet.turing.nlp.TurNLPUtils;
Expand All @@ -54,6 +53,7 @@
import java.io.File;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;

Expand Down Expand Up @@ -134,7 +134,7 @@ public TurNLPInstance turNLPInstanceAdd(@RequestBody TurNLPInstance turNLPInstan

@PostMapping(value = "/{id}/validate/file/blazon", produces = MediaType.APPLICATION_XML_VALUE)
public RedactionScript validateFile(@RequestParam("file") MultipartFile multipartFile, @PathVariable String id) {
final String text = TurFileUtils.documentToText(multipartFile);
final String text = Objects.requireNonNull(TurFileUtils.documentToText(multipartFile)).getContent();
TurNLPTextValidate textValidate = new TurNLPTextValidate();
textValidate.setText(text);
return this.turNLPInstanceRepository.findById(id).map(turNLPInstance ->
Expand All @@ -149,18 +149,18 @@ public TurNLPValidateResponse validateDocument(@PathVariable String id,
@RequestParam("config") String turNLPValidateDocumentRequest) {

File file = TurSpringUtils.getFileFromMultipart(multipartFile);
TurFileAttributes turFileAttributes = TurFileUtils.readFile(file);
TurTikaFileAttributes turTikaFileAttributes = TurFileUtils.readFile(file);
return this.turNLPInstanceRepository.findById(id)
.map(turNLPInstance -> {
try {
TurNLPValidateDocument turNLPValidateDocument = new ObjectMapper().readValue(turNLPValidateDocumentRequest,
TurNLPValidateDocument.class);
if (turFileAttributes != null && turNLPValidateDocument != null) {
if (turTikaFileAttributes != null && turNLPValidateDocument != null) {
TurNLPResponse turNLPResponse = turNLPProcess.processTextByNLP(turNLPInstance,
turFileAttributes.getContent(), turNLPValidateDocument.getEntities());
turTikaFileAttributes.getContent(), turNLPValidateDocument.getEntities());
List<String> terms = getNLPTerms(turNLPResponse);
turNLPUtils.redactPdf(file, terms);
return createNLPValidateResponse(turNLPInstance, turNLPResponse, turFileAttributes.getContent());
return createNLPValidateResponse(turNLPInstance, turNLPResponse, turTikaFileAttributes.getContent());
}
} catch (JsonProcessingException e) {
log.error(e.getMessage(), e);
Expand Down
45 changes: 39 additions & 6 deletions turing-app/src/main/java/com/viglet/turing/api/ocr/TurOcrAPI.java
Original file line number Diff line number Diff line change
@@ -1,22 +1,55 @@
/*
*
* Copyright (C) 2016-2024 the original author or authors.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

package com.viglet.turing.api.ocr;

import com.viglet.turing.commons.file.TurFileAttributes;
import com.viglet.turing.filesystem.commons.TurFileUtils;
import io.swagger.v3.oas.annotations.tags.Tag;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;
import java.net.MalformedURLException;
import java.net.URI;

/**
* @author Alexandre Oliveira
* @since 0.3.9
*/
@Slf4j
@RestController
@RequestMapping("/api/ocr")
@Tag(name = "OCR", description = "OCR API")
public class TurOcrAPI {

@PostMapping
public String convertToText(@RequestParam("file") MultipartFile multipartFile) {
@PostMapping("/file")
public TurFileAttributes fileToText(@RequestParam("file") MultipartFile multipartFile) {
return TurFileUtils.documentToText(multipartFile);
}

/**
 * Converts the document found at the URL given in the request body into
 * its extracted file attributes.
 *
 * @param turOcrFromUrl request payload carrying the document URL
 * @return the result of {@code TurFileUtils.urlContentToText} for that URL, or an
 *         empty {@link TurFileAttributes} when the URL is missing or cannot be
 *         turned into a {@code java.net.URL}
 */
@PostMapping("/url")
public TurFileAttributes urlToText(@RequestBody TurOcrFromUrl turOcrFromUrl) {
    // A JSON body without "url" leaves the field null; URI.create(null) would
    // throw a NullPointerException instead of reaching the empty-result fallback.
    if (turOcrFromUrl == null || turOcrFromUrl.getUrl() == null) {
        log.error("OCR request without a URL");
        return new TurFileAttributes();
    }
    try {
        // NOTE(review): the URL is client-supplied and not restricted by scheme or
        // host here; confirm whether it should be limited (e.g. http/https only)
        // before it is handed to TurFileUtils.urlContentToText.
        return TurFileUtils.urlContentToText(URI.create(turOcrFromUrl.getUrl()).toURL());
    } catch (MalformedURLException | IllegalArgumentException e) {
        // URI.create rejects bad syntax and URI.toURL rejects non-absolute URIs with
        // an unchecked IllegalArgumentException; treat both like a malformed URL.
        log.error(e.getMessage(), e);
    }
    return new TurFileAttributes();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
*
* Copyright (C) 2016-2024 the original author or authors.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

package com.viglet.turing.api.ocr;

import lombok.Getter;
import lombok.Setter;

/**
 * Request payload for the OCR "from URL" endpoint: carries the address of the
 * document to be converted to text. The getter and setter are generated by Lombok.
 *
 * @author Alexandre Oliveira
 * @since 0.3.9
 */
@Getter
@Setter
public class TurOcrFromUrl {
// Document address as sent by the client; TurOcrAPI.urlToText parses it with URI.create(...).toURL().
private String url;
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
@Setter
@Builder
public class TurSolrCoreExists {
String name;
boolean exists;
private String name;
private boolean exists;

@Tolerate
public TurSolrCoreExists() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@
import com.viglet.turing.client.sn.job.TurSNJobItem;
import com.viglet.turing.client.sn.job.TurSNJobItems;
import com.viglet.turing.commons.utils.TurCommonsUtils;
import com.viglet.turing.filesystem.commons.TurFileAttributes;
import com.viglet.turing.filesystem.commons.TurFileUtils;
import com.viglet.turing.filesystem.commons.TurTikaFileAttributes;
import com.viglet.turing.persistence.repository.sn.TurSNSiteRepository;
import com.viglet.turing.sn.TurSNConstants;
import com.viglet.turing.spring.utils.TurSpringUtils;
Expand Down Expand Up @@ -108,16 +108,11 @@ public boolean turSNImportZipFileBroker(@RequestParam("file") MultipartFile mult
private void extractTextOfFileAttribute(File extractFolder, Map.Entry<String, Object> attribute) {
if (attribute.getValue().toString().startsWith(TurSNConstants.FILE_PROTOCOL)) {
String fileName = attribute.getValue().toString().replace(TurSNConstants.FILE_PROTOCOL, "");
try (FileInputStream fileInputStreamAttribute = new FileInputStream(
extractFolder.getAbsolutePath() + File.separator + fileName)) {
TurFileAttributes turFileAttributes = TurFileUtils.parseFile(fileInputStreamAttribute, null);
Optional.ofNullable(turFileAttributes)
.map(TurFileAttributes::getContent)
.ifPresent(content -> attribute.setValue(TurCommonsUtils.cleanTextContent(content)));
} catch (IOException e) {
log.error(e.getMessage(), e);
}

File file = new File(extractFolder.getAbsolutePath().concat(File.separator).concat(fileName));
TurTikaFileAttributes turTikaFileAttributes = TurFileUtils.parseFile(file);
Optional.ofNullable(turTikaFileAttributes)
.map(TurTikaFileAttributes::getContent)
.ifPresent(content -> attribute.setValue(TurCommonsUtils.cleanTextContent(content)));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ SecurityFilterChain filterChain(HttpSecurity http, MvcRequestMatcher.Builder mvc
mvc.pattern("/error/**"),
mvc.pattern("/logout"),
mvc.pattern("/api/nlp/**"),
mvc.pattern("/api/ocr"),
mvc.pattern("/api/ocr/**"),
mvc.pattern("/api/llm/**"),
mvc.pattern("/api/v2/guest/**"),
AntPathRequestMatcher.antMatcher("/h2/**")))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
*
* Copyright (C) 2016-2024 the original author or authors.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package com.viglet.turing.commons.file;

import lombok.*;

import java.util.Date;
import java.util.Map;

/**
 * Attributes of a processed file: its text content plus descriptive properties.
 * <p>
 * Lombok generates the getters and setters, the no-args and all-args constructors,
 * a builder (with {@code toBuilder()}) and {@code toString()}. The field order
 * below is the parameter order of the generated all-args constructor, so it
 * must not be changed casually.
 *
 * @author Alexandre Oliveira
 *
 * @since 0.3.9
 *
 **/

@Builder(toBuilder = true)
@NoArgsConstructor
@AllArgsConstructor
@Setter
@Getter
@ToString
public class TurFileAttributes {
// Text of the document; callers in this commit pass it on as plain text via getContent().
private String content;
// File name — presumably without the directory part; TODO confirm against TurFileUtils.
private String name;
// Document title — presumably taken from the parsed metadata; TODO confirm.
private String title;
// File extension — presumably without the leading dot; TODO confirm.
private String extension;
// File size expressed in bytes, kilobytes and megabytes (see TurFileSize).
private TurFileSize size;
// Last modification date of the file.
private Date lastModified;
// Additional name/value metadata pairs associated with the file.
private Map<String, String> metadata;


}
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
*
* Copyright (C) 2016-2024 the original author or authors.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

package com.viglet.turing.commons.file;

import lombok.*;

import java.math.BigDecimal;
import java.math.RoundingMode;

/**
 * Immutable file size expressed in bytes, kilobytes and megabytes, each value
 * rounded half-up to two decimal places.
 * <p>
 * Lombok generates the getters, the three-argument constructor and
 * {@code toString()}. No setters are generated: every field is {@code final},
 * so the former class-level {@code @Setter} had no effect and was removed.
 *
 * @author Alexandre Oliveira
 * @since 0.3.9
 **/
@Getter
@AllArgsConstructor
@ToString
public class TurFileSize {
    private static final float BYTES_PER_KILOBYTE = 1024f;
    private static final float BYTES_PER_MEGABYTE = BYTES_PER_KILOBYTE * BYTES_PER_KILOBYTE;

    private final float bytes;
    private final float kiloBytes;
    private final float megaBytes;

    /**
     * Creates a size of zero bytes.
     */
    public TurFileSize() {
        this(0f);
    }

    /**
     * Creates a size from a byte count and derives the kilobyte and megabyte values.
     *
     * @param bytes size in bytes
     * @throws NumberFormatException if {@code bytes} is NaN or infinite
     */
    public TurFileSize(float bytes) {
        this.bytes = twoDecimalFloat(bytes);
        this.kiloBytes = twoDecimalFloat(this.bytes / BYTES_PER_KILOBYTE);
        // Derive megabytes from the byte count, not from the already-rounded
        // kilobyte value: rounding twice compounds the error (26210 bytes is
        // 25.60 KB after rounding, and 25.60 / 1024 rounds up to 0.03 MB although
        // the real value is 0.02 MB).
        this.megaBytes = twoDecimalFloat(this.bytes / BYTES_PER_MEGABYTE);
    }

    /**
     * Rounds a value half-up to two decimal places.
     */
    private static float twoDecimalFloat(float value) {
        return BigDecimal.valueOf(value).setScale(2, RoundingMode.HALF_UP).floatValue();
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import com.viglet.turing.commons.cache.TurCustomClassCache;
import com.viglet.turing.connector.db.ext.TurDbExtCustomImpl;
import com.viglet.turing.connector.db.format.TurDbFormatValue;
import com.viglet.turing.filesystem.commons.TurFileAttributes;
import com.viglet.turing.filesystem.commons.TurTikaFileAttributes;
import com.viglet.turing.filesystem.commons.TurFileUtils;
import lombok.Getter;
import lombok.Setter;
Expand Down Expand Up @@ -302,23 +302,23 @@ private void addDBFieldsAsAttributes(ResultSet rs, Map<String, Object> attribute

private void addFileAttributes(Map<String, Object> attributes) {
if (filePathField != null && attributes.containsKey(filePathField)) {
TurFileAttributes turFileAttributes = TurFileUtils.readFile((String) attributes.get(filePathField));
if (turFileAttributes != null) {
addFileSizeAttribute(attributes, turFileAttributes);
addFileContentAttribute(attributes, turFileAttributes);
TurTikaFileAttributes turTikaFileAttributes = TurFileUtils.readFile((String) attributes.get(filePathField));
if (turTikaFileAttributes != null) {
addFileSizeAttribute(attributes, turTikaFileAttributes);
addFileContentAttribute(attributes, turTikaFileAttributes);
}
}
}

private void addFileContentAttribute(Map<String, Object> attributes, TurFileAttributes turFileAttributes) {
private void addFileContentAttribute(Map<String, Object> attributes, TurTikaFileAttributes turTikaFileAttributes) {
if (fileContentField != null) {
long maxContentByteSize = maxContentMegaByteSize * MEGA_BYTE;

if (turFileAttributes.getContent().getBytes().length <= maxContentByteSize) {
attributes.put(fileContentField, turFileAttributes.getContent());
if (turTikaFileAttributes.getContent().getBytes().length <= maxContentByteSize) {
attributes.put(fileContentField, turTikaFileAttributes.getContent());
} else {
attributes.put(fileContentField,
turFileAttributes.getContent().substring(0, Math.toIntExact(maxContentByteSize)));
turTikaFileAttributes.getContent().substring(0, Math.toIntExact(maxContentByteSize)));
if (log.isDebugEnabled()) {
log.debug("File size greater than {}, truncating content ...:",
FileUtils.byteCountToDisplaySize(maxContentByteSize));
Expand All @@ -329,14 +329,14 @@ private void addFileContentAttribute(Map<String, Object> attributes, TurFileAttr
}
}

private void addFileSizeAttribute(Map<String, Object> attributes, TurFileAttributes turFileAttributes) {
if (fileSizeField != null && turFileAttributes.getFile() != null) {
attributes.put(fileSizeField, turFileAttributes.getFile().length());
private void addFileSizeAttribute(Map<String, Object> attributes, TurTikaFileAttributes turTikaFileAttributes) {
if (fileSizeField != null && turTikaFileAttributes.getFile() != null) {
attributes.put(fileSizeField, turTikaFileAttributes.getFile().length());
if (log.isDebugEnabled()) {
log.debug("File: {}", turFileAttributes.getFile().getAbsolutePath());
log.debug("File size: {}", FileUtils.byteCountToDisplaySize(turFileAttributes.getFile().length()));
log.debug("File: {}", turTikaFileAttributes.getFile().getAbsolutePath());
log.debug("File size: {}", FileUtils.byteCountToDisplaySize(turTikaFileAttributes.getFile().length()));
log.debug("File - Content size: {}",
FileUtils.byteCountToDisplaySize(turFileAttributes.getContent().getBytes().length));
FileUtils.byteCountToDisplaySize(turTikaFileAttributes.getContent().getBytes().length));
}
} else {
log.debug("File without size: {}", filePathField);
Expand Down
Loading

0 comments on commit 212ed8a

Please sign in to comment.