forked from ad-freiburg/pdfact
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Set version 11 to avoid conflicts * Add FontFace and Color properties to Paragraph * Add FontFace and Color properties to Word * Fix exceptions * Add api skeleton * Modified PdfApi and PdfService * modified PdfService to accept a url * Improve exceptions handling and move models in a package * Add docs --------- Co-authored-by: AnnaMarika01 <[email protected]>
- Loading branch information
1 parent
00d4db9
commit 1f53bba
Showing
8 changed files
with
346 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>pdfact</groupId>
        <artifactId>pdfact-parent</artifactId>
        <version>0.0.1-SNAPSHOT</version>
    </parent>

    <artifactId>pdfact-api</artifactId>

    <dependencies>
        <!-- PdfAct modules (each declared exactly once; compile is the default scope) -->
        <dependency>
            <groupId>pdfact</groupId>
            <artifactId>pdfact-core</artifactId>
            <version>0.0.1-SNAPSHOT</version>
        </dependency>
        <dependency>
            <groupId>pdfact</groupId>
            <artifactId>pdfact-cli</artifactId>
            <version>0.0.1-SNAPSHOT</version>
        </dependency>
        <!-- Spark Java -->
        <dependency>
            <groupId>com.sparkjava</groupId>
            <artifactId>spark-core</artifactId>
            <version>2.9.3</version>
        </dependency>
        <!-- JSON Handling -->
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.8.8</version>
        </dependency>
    </dependencies>

    <properties>
        <maven.compiler.source>17</maven.compiler.source>
        <maven.compiler.target>17</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

</project>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
package pdfact.api; | ||
|
||
import com.google.gson.Gson; | ||
import com.google.gson.GsonBuilder; | ||
import com.google.gson.JsonObject; | ||
import pdfact.api.model.RequestPayload; | ||
import pdfact.core.util.exception.PdfActException; | ||
import spark.Request; | ||
import spark.Response; | ||
|
||
import java.io.IOException; | ||
|
||
import static spark.Spark.post; | ||
|
||
/** | ||
* The API to parse a pdf file. | ||
*/ | ||
public class PdfApi { | ||
|
||
public static void main(String[] args) { | ||
PdfService pdfService = new PdfService(); | ||
Gson gson = new GsonBuilder().disableHtmlEscaping().create(); | ||
|
||
post("/api/pdf/parse", (request, response) -> parsePdf(request, response, pdfService, gson), gson::toJson); | ||
} | ||
|
||
private static Object parsePdf(Request request, Response response, PdfService pdfService, Gson gson) { | ||
String body = request.body(); | ||
RequestPayload requestPayload = gson.fromJson(body, RequestPayload.class); | ||
|
||
if (requestPayload == null || requestPayload.getUrl() == null || requestPayload.getUrl().isEmpty()) { | ||
response.status(400); | ||
JsonObject errorResponse = new JsonObject(); | ||
errorResponse.addProperty("error", "File url is required"); | ||
return errorResponse; | ||
} | ||
|
||
JsonObject jsonResult; | ||
|
||
try { | ||
String jsonString = pdfService.parsePdf(requestPayload.getUrl(), requestPayload.getUnit(), requestPayload.getRoles()); | ||
jsonResult = gson.fromJson(jsonString, JsonObject.class); | ||
response.status(200); | ||
} catch (IllegalArgumentException e) { | ||
response.status(422); | ||
jsonResult = new JsonObject(); | ||
jsonResult.addProperty("error", "Illegal arguments. " + e.getMessage()); | ||
} catch (IOException e) { | ||
response.status(400); | ||
jsonResult = new JsonObject(); | ||
jsonResult.addProperty("error", "An error occurred while downloading the pdf file. " + e.getMessage()); | ||
} catch (PdfActException e) { | ||
response.status(500); | ||
jsonResult = new JsonObject(); | ||
jsonResult.addProperty("error", "An error occurred while processing the pdf file."); | ||
} | ||
return jsonResult; | ||
} | ||
} | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
package pdfact.api; | ||
|
||
import pdfact.cli.PdfAct; | ||
import pdfact.cli.model.ExtractionUnit; | ||
import pdfact.cli.pipes.serialize.PdfJsonSerializer; | ||
import pdfact.core.model.Document; | ||
import pdfact.core.model.SemanticRole; | ||
import pdfact.core.util.exception.PdfActException; | ||
|
||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.net.URL; | ||
import java.nio.charset.StandardCharsets; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.nio.file.StandardCopyOption; | ||
import java.util.Arrays; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Set; | ||
|
||
|
||
/** | ||
* The service to process pdf file and extract text. | ||
*/ | ||
public class PdfService { | ||
|
||
/** | ||
* Download a pdf file and create a json representation of its content. | ||
* | ||
* @param fileUrl: The url to access the pdf file. | ||
* @param unitSelected: The unit to split text on (e.g., paragraphs, words, characters, etc.). | ||
* @param rolesSelected: The roles to extract (e.g., body, title, etc.). | ||
* @throws IOException: If the file download/load goes wrong. | ||
* @throws PdfActException: If the pdf processing or text extraction goes wrong. | ||
* @throws IllegalArgumentException: If wrong roles or units are passed by. | ||
* @return: A json representation of the extracted text. | ||
*/ | ||
public String parsePdf(String fileUrl, String unitSelected, List<String> rolesSelected) throws IOException, PdfActException, IllegalArgumentException { | ||
PdfAct pdfAct = new PdfAct(); | ||
String jsonString; | ||
Set<ExtractionUnit> unit = new HashSet<>(); | ||
Set<SemanticRole> roles; | ||
|
||
if (unitSelected != null) { | ||
unit = getExtractionUnitSet(unitSelected); | ||
pdfAct.setExtractionUnits(unit); | ||
} else { | ||
unit.add(ExtractionUnit.PARAGRAPH); | ||
} | ||
if (rolesSelected != null) { | ||
roles = convertToSemanticRoles(rolesSelected); | ||
pdfAct.setSemanticRoles(roles); | ||
} else { | ||
roles = new HashSet<>(Arrays.asList(SemanticRole.values())); | ||
} | ||
|
||
Path tempFile = downloadFileFromUrl(fileUrl); | ||
Document pdf = pdfAct.parse(tempFile.toString()); | ||
PdfJsonSerializer serializer = new PdfJsonSerializer(unit, roles); | ||
byte[] serializedPdf = serializer.serialize(pdf); | ||
jsonString = new String(serializedPdf, StandardCharsets.UTF_8); | ||
|
||
return jsonString; | ||
|
||
} | ||
|
||
/** | ||
* Download a pdf file. | ||
* | ||
* @param fileUrl: The url to access the pdf file. | ||
* @throws IOException: If the file download/load goes wrong. | ||
* @return: The path to the downloaded pdf file. | ||
*/ | ||
private Path downloadFileFromUrl(String fileUrl) throws IOException { | ||
URL url = new URL(fileUrl); | ||
Path tempFile = Files.createTempFile("temp", ".pdf"); | ||
try (InputStream in = url.openStream()) { | ||
Files.copy(in, tempFile, StandardCopyOption.REPLACE_EXISTING); | ||
} | ||
return tempFile; | ||
} | ||
|
||
/** | ||
* Validate the given unit. | ||
* | ||
* @param unit: The unit to split text on (e.g., paragraphs, words, characters, etc.). | ||
* @throws IllegalArgumentException: If wrong units are passed by. | ||
* @return: The validated unit. | ||
*/ | ||
public Set<ExtractionUnit> getExtractionUnitSet(String unit) throws IllegalArgumentException { | ||
Set<ExtractionUnit> unitSelected = new HashSet<>(); | ||
try { | ||
ExtractionUnit extractionUnit = ExtractionUnit.valueOf(unit.toUpperCase()); | ||
unitSelected.add(extractionUnit); | ||
} catch (IllegalArgumentException e) { | ||
throw new IllegalArgumentException("The extraction unit `" + unit + "` is not valid.", e); | ||
} | ||
return unitSelected; | ||
} | ||
|
||
/** | ||
* Validate the list of roles. | ||
* | ||
* @param rolesList: The roles to extract (e.g., body, title, etc.). | ||
* @throws IllegalArgumentException: If wrong roles are passed by. | ||
* @return: The validated list of roles. | ||
*/ | ||
public Set<SemanticRole> convertToSemanticRoles(List<String> rolesList) throws IllegalArgumentException { | ||
Set<SemanticRole> roles = new HashSet<>(); | ||
for (String role : rolesList) { | ||
try { | ||
SemanticRole semanticRole = SemanticRole.valueOf(role.toUpperCase()); | ||
roles.add(semanticRole); | ||
} catch (IllegalArgumentException e) { | ||
throw new IllegalArgumentException("The role `" + role + "` is not valid.", e); | ||
} | ||
} | ||
return roles; | ||
} | ||
|
||
} |
51 changes: 51 additions & 0 deletions
51
pdfact-api/src/main/java/pdfact/api/model/RequestPayload.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
package pdfact.api.model; | ||
|
||
import java.util.List; | ||
|
||
/**
 * The body a client is expected to send with a parse request.
 *
 * <p>A plain mutable bean: each property has a getter and a setter and is {@code null} until
 * it is set.
 */
public class RequestPayload {

    /** The url to access the pdf file. */
    private String url;

    /** The unit to split text on (e.g., paragraphs, words, characters, etc.). */
    private String unit;

    /** The roles to extract (e.g., body, title, etc.). */
    private List<String> roles;

    // ---- url -------------------------------------------------------------------------------------

    /**
     * @return The url to access the pdf file.
     */
    public String getUrl() {
        return this.url;
    }

    /**
     * @param value The url to access the pdf file.
     */
    public void setUrl(String value) {
        url = value;
    }

    // ---- unit ------------------------------------------------------------------------------------

    /**
     * @return The unit to split text on.
     */
    public String getUnit() {
        return this.unit;
    }

    /**
     * @param value The unit to split text on.
     */
    public void setUnit(String value) {
        unit = value;
    }

    // ---- roles -----------------------------------------------------------------------------------

    /**
     * @return The roles to extract (the stored list itself, not a copy).
     */
    public List<String> getRoles() {
        return this.roles;
    }

    /**
     * @param value The roles to extract (stored as given, not copied).
     */
    public void setRoles(List<String> value) {
        roles = value;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.