Skip to content

Commit

Permalink
Add API implementation (#1)
Browse files Browse the repository at this point in the history
* Set version 11 to avoid conflicts

* Add FontFace and Color properties to Paragraph

* Add FontFace and Color properties to Word

* Fix exceptions

* Add api skeleton

* Modified PdfApi and PdfService

* Modified PdfService to accept a url

* Improve exceptions handling and move models in a package

* Add docs

---------

Co-authored-by: AnnaMarika01 <[email protected]>
  • Loading branch information
andreaponti5 and AnnaMarika01 authored May 27, 2024
1 parent 00d4db9 commit 1f53bba
Show file tree
Hide file tree
Showing 8 changed files with 346 additions and 5 deletions.
51 changes: 51 additions & 0 deletions pdfact-api/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Build descriptor of the pdfact-api module: an HTTP front end on top of pdfact-core/pdfact-cli. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>pdfact</groupId>
        <artifactId>pdfact-parent</artifactId>
        <version>0.0.1-SNAPSHOT</version>
    </parent>

    <artifactId>pdfact-api</artifactId>

    <dependencies>
        <!-- Declared exactly once: a second pdfact-core entry makes Maven warn that
             dependency declarations must be unique. "compile" is the default scope. -->
        <dependency>
            <groupId>pdfact</groupId>
            <artifactId>pdfact-core</artifactId>
            <version>0.0.1-SNAPSHOT</version>
        </dependency>
        <dependency>
            <groupId>pdfact</groupId>
            <artifactId>pdfact-cli</artifactId>
            <version>0.0.1-SNAPSHOT</version>
        </dependency>
        <!-- Spark Java -->
        <dependency>
            <groupId>com.sparkjava</groupId>
            <artifactId>spark-core</artifactId>
            <version>2.9.3</version>
        </dependency>
        <!-- JSON Handling -->
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.8</version>
        </dependency>
    </dependencies>

    <properties>
        <maven.compiler.source>17</maven.compiler.source>
        <maven.compiler.target>17</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

</project>
60 changes: 60 additions & 0 deletions pdfact-api/src/main/java/pdfact/api/PdfApi.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package pdfact.api;

import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.JsonObject;
import pdfact.api.model.RequestPayload;
import pdfact.core.util.exception.PdfActException;
import spark.Request;
import spark.Response;

import java.io.IOException;

import static spark.Spark.post;

/**
* The API to parse a pdf file.
*/
public class PdfApi {

public static void main(String[] args) {
PdfService pdfService = new PdfService();
Gson gson = new GsonBuilder().disableHtmlEscaping().create();

post("/api/pdf/parse", (request, response) -> parsePdf(request, response, pdfService, gson), gson::toJson);
}

private static Object parsePdf(Request request, Response response, PdfService pdfService, Gson gson) {
String body = request.body();
RequestPayload requestPayload = gson.fromJson(body, RequestPayload.class);

if (requestPayload == null || requestPayload.getUrl() == null || requestPayload.getUrl().isEmpty()) {
response.status(400);
JsonObject errorResponse = new JsonObject();
errorResponse.addProperty("error", "File url is required");
return errorResponse;
}

JsonObject jsonResult;

try {
String jsonString = pdfService.parsePdf(requestPayload.getUrl(), requestPayload.getUnit(), requestPayload.getRoles());
jsonResult = gson.fromJson(jsonString, JsonObject.class);
response.status(200);
} catch (IllegalArgumentException e) {
response.status(422);
jsonResult = new JsonObject();
jsonResult.addProperty("error", "Illegal arguments. " + e.getMessage());
} catch (IOException e) {
response.status(400);
jsonResult = new JsonObject();
jsonResult.addProperty("error", "An error occurred while downloading the pdf file. " + e.getMessage());
} catch (PdfActException e) {
response.status(500);
jsonResult = new JsonObject();
jsonResult.addProperty("error", "An error occurred while processing the pdf file.");
}
return jsonResult;
}
}

122 changes: 122 additions & 0 deletions pdfact-api/src/main/java/pdfact/api/PdfService.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
package pdfact.api;

import pdfact.cli.PdfAct;
import pdfact.cli.model.ExtractionUnit;
import pdfact.cli.pipes.serialize.PdfJsonSerializer;
import pdfact.core.model.Document;
import pdfact.core.model.SemanticRole;
import pdfact.core.util.exception.PdfActException;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;


/**
 * The service to process a pdf file and extract its text.
 */
public class PdfService {

    /**
     * Downloads a pdf file and creates a json representation of its content.
     *
     * <p>The unit and the roles are validated before anything is downloaded. The downloaded copy
     * is a temporary file that is removed again before this method returns, whether the
     * extraction succeeded or not.
     *
     * @param fileUrl       the url to access the pdf file.
     * @param unitSelected  the unit to split text on (e.g., paragraphs, words, characters, etc.);
     *                      if {@code null}, {@link ExtractionUnit#PARAGRAPH} is used for the
     *                      serialization.
     * @param rolesSelected the roles to extract (e.g., body, title, etc.); if {@code null}, all
     *                      values of {@link SemanticRole} are used for the serialization.
     * @return a json representation of the extracted text.
     * @throws IOException              if the file download/load goes wrong.
     * @throws PdfActException          if the pdf processing or text extraction goes wrong.
     * @throws IllegalArgumentException if a wrong role or unit is passed.
     */
    public String parsePdf(String fileUrl, String unitSelected, List<String> rolesSelected) throws IOException, PdfActException, IllegalArgumentException {
        PdfAct pdfAct = new PdfAct();
        Set<ExtractionUnit> unit = new HashSet<>();
        Set<SemanticRole> roles;

        if (unitSelected != null) {
            unit = getExtractionUnitSet(unitSelected);
            pdfAct.setExtractionUnits(unit);
        } else {
            unit.add(ExtractionUnit.PARAGRAPH);
        }
        if (rolesSelected != null) {
            roles = convertToSemanticRoles(rolesSelected);
            pdfAct.setSemanticRoles(roles);
        } else {
            roles = new HashSet<>(Arrays.asList(SemanticRole.values()));
        }

        Path tempFile = downloadFileFromUrl(fileUrl);
        try {
            Document pdf = pdfAct.parse(tempFile.toString());
            PdfJsonSerializer serializer = new PdfJsonSerializer(unit, roles);
            byte[] serializedPdf = serializer.serialize(pdf);
            return new String(serializedPdf, StandardCharsets.UTF_8);
        } finally {
            // Without this, every request leaves one pdf behind in the temp directory.
            deleteQuietly(tempFile);
        }
    }

    /**
     * Downloads a pdf file into a fresh temporary file.
     *
     * <p>NOTE(review): the url is opened as given, so every protocol {@link URL} can open
     * (including {@code file:}) is accepted. If the url originates from an untrusted client,
     * consider restricting the allowed schemes/hosts.
     *
     * @param fileUrl the url to access the pdf file.
     * @return the path to the downloaded pdf file; the caller is responsible for deleting it.
     * @throws IOException if the file download/load goes wrong.
     */
    private Path downloadFileFromUrl(String fileUrl) throws IOException {
        URL url = new URL(fileUrl);
        Path tempFile = Files.createTempFile("temp", ".pdf");
        try (InputStream in = url.openStream()) {
            Files.copy(in, tempFile, StandardCopyOption.REPLACE_EXISTING);
        } catch (IOException | RuntimeException e) {
            // The caller never sees the path when the download fails, so clean up here.
            deleteQuietly(tempFile);
            throw e;
        }
        return tempFile;
    }

    /**
     * Deletes the given file if it exists, ignoring any failure to do so.
     *
     * @param file the file to delete.
     */
    private static void deleteQuietly(Path file) {
        try {
            Files.deleteIfExists(file);
        } catch (IOException ignored) {
            // Best-effort cleanup: a leftover temp file must not mask the result (or the
            // original exception) of the operation that used it.
        }
    }

    /**
     * Validates the given unit.
     *
     * @param unit the unit to split text on (e.g., paragraphs, words, characters, etc.); matched
     *             case-insensitively against the names of {@link ExtractionUnit}.
     * @return a set holding the validated unit.
     * @throws IllegalArgumentException if the unit is {@code null} or not a valid unit.
     */
    public Set<ExtractionUnit> getExtractionUnitSet(String unit) throws IllegalArgumentException {
        if (unit == null) {
            throw new IllegalArgumentException("The extraction unit `null` is not valid.");
        }
        Set<ExtractionUnit> unitSelected = new HashSet<>();
        try {
            // Locale.ROOT keeps the upper-casing independent of the default locale
            // (e.g. the Turkish dotted/dotless i).
            ExtractionUnit extractionUnit = ExtractionUnit.valueOf(unit.toUpperCase(Locale.ROOT));
            unitSelected.add(extractionUnit);
        } catch (IllegalArgumentException e) {
            throw new IllegalArgumentException("The extraction unit `" + unit + "` is not valid.", e);
        }
        return unitSelected;
    }

    /**
     * Validates the list of roles.
     *
     * @param rolesList the roles to extract (e.g., body, title, etc.); each entry is matched
     *                  case-insensitively against the names of {@link SemanticRole}.
     * @return the validated set of roles.
     * @throws IllegalArgumentException if an entry is {@code null} or not a valid role.
     */
    public Set<SemanticRole> convertToSemanticRoles(List<String> rolesList) throws IllegalArgumentException {
        Set<SemanticRole> roles = new HashSet<>();
        for (String role : rolesList) {
            if (role == null) {
                // A null entry would otherwise fail with a NullPointerException instead of
                // the documented IllegalArgumentException.
                throw new IllegalArgumentException("The role `null` is not valid.");
            }
            try {
                // Locale.ROOT: see getExtractionUnitSet.
                SemanticRole semanticRole = SemanticRole.valueOf(role.toUpperCase(Locale.ROOT));
                roles.add(semanticRole);
            } catch (IllegalArgumentException e) {
                throw new IllegalArgumentException("The role `" + role + "` is not valid.", e);
            }
        }
        return roles;
    }

}
51 changes: 51 additions & 0 deletions pdfact-api/src/main/java/pdfact/api/model/RequestPayload.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package pdfact.api.model;

import java.util.List;

/**
 * The body a client is expected to send with a parse request.
 *
 * <p>A plain mutable bean: every property starts out as {@code null} and is read and written
 * through its getter/setter pair.
 */
public class RequestPayload {

    /** The url to access the pdf file. */
    private String url;

    /** The unit to split text on (e.g., paragraphs, words, characters, etc.). */
    private String unit;

    /** The roles to extract (e.g., body, title, etc.). */
    private List<String> roles;

    // ---------------------------------------------------------------------------------------------
    // Getters.

    /**
     * Returns the url to access the pdf file.
     *
     * @return the url, or {@code null} if none was set.
     */
    public String getUrl() {
        return this.url;
    }

    /**
     * Returns the unit to split text on.
     *
     * @return the unit, or {@code null} if none was set.
     */
    public String getUnit() {
        return this.unit;
    }

    /**
     * Returns the roles to extract.
     *
     * @return the roles, or {@code null} if none were set.
     */
    public List<String> getRoles() {
        return this.roles;
    }

    // ---------------------------------------------------------------------------------------------
    // Setters.

    /**
     * Sets the url to access the pdf file.
     *
     * @param url the url.
     */
    public void setUrl(String url) {
        this.url = url;
    }

    /**
     * Sets the unit to split text on.
     *
     * @param unit the unit.
     */
    public void setUnit(String unit) {
        this.unit = unit;
    }

    /**
     * Sets the roles to extract. The given list is stored as is (no copy is made).
     *
     * @param roles the roles.
     */
    public void setRoles(List<String> roles) {
        this.roles = roles;
    }
}
13 changes: 13 additions & 0 deletions pdfact-cli/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@
<artifactId>json</artifactId>
<version>20160810</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.5</version>
</dependency>
</dependencies>

<build>
Expand Down Expand Up @@ -105,6 +110,14 @@
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>11</source>
<target>11</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
25 changes: 23 additions & 2 deletions pdfact-core/src/main/java/pdfact/core/model/Paragraph.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import java.util.List;

import org.apache.commons.lang3.NotImplementedException;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.apache.commons.lang3.builder.HashCodeBuilder;

Expand All @@ -12,7 +13,7 @@
*
* @author Claudius Korzen
*/
public class Paragraph extends Element implements HasWords, HasText, HasPositions, HasSemanticRole {
public class Paragraph extends Element implements HasWords, HasText, HasPositions, HasSemanticRole, HasColor, HasFontFace {
/**
* The words of this paragraph.
*/
Expand Down Expand Up @@ -194,7 +195,7 @@ public boolean equals(Object other) {
builder.append(getPositions(), otherParagraph.getPositions());
builder.append(getSemanticRole(), otherParagraph.getSemanticRole());
builder.append(getSecondarySemanticRole(),
otherParagraph.getSecondarySemanticRole());
otherParagraph.getSecondarySemanticRole());

return builder.isEquals();
}
Expand All @@ -211,4 +212,24 @@ public int hashCode() {
builder.append(getSecondarySemanticRole());
return builder.hashCode();
}

/**
 * Returns the color of this paragraph, taken as the most common object of the color
 * frequencies held by this paragraph's character statistic.
 *
 * @return the most common color according to the character statistic.
 */
@Override
public Color getColor() {
  return characterStatistic.colorFrequencies.getMostCommonObject();
}

/**
 * Not supported: the color of a paragraph is derived (see the getter) and cannot be assigned.
 *
 * @param color ignored.
 * @throws NotImplementedException always.
 */
@Override
public void setColor(Color color) {
  final String reason = "Paragraph does not allow to explicitly set the Color";
  throw new NotImplementedException(reason);
}

/**
 * Returns the font face of this paragraph, taken as the most common object of the font face
 * frequencies held by this paragraph's character statistic.
 *
 * @return the most common font face according to the character statistic.
 */
@Override
public FontFace getFontFace() {
  return characterStatistic.fontFaceFrequencies.getMostCommonObject();
}

/**
 * Not supported: the font face of a paragraph is derived (see the getter) and cannot be
 * assigned.
 *
 * @param fontFace ignored.
 * @throws NotImplementedException always.
 */
@Override
public void setFontFace(FontFace fontFace) {
  final String reason = "Paragraph does not allow to explicitly set the FontFace";
  throw new NotImplementedException(reason);
}
}
Loading

0 comments on commit 1f53bba

Please sign in to comment.