Add API implementation (#1)

* Set version 11 to avoid conflicts * Add FontFace and Color properties to Paragraph * Add FontFace and Color properties to Word * Fix exceptions * Add api skeleton * Modified PdfApi and PdfService * modified PdfService to accept a url * Improve exceptions handling and move models in a package * Add docs --------- Co-authored-by: AnnaMarika01 <[email protected]>
data-house · May 27, 2024 · 1f53bba · 1f53bba
1 parent 00d4db9
commit 1f53bba
Show file tree

Hide file tree

Showing 8 changed files with 346 additions and 5 deletions.
diff --git a/pdfact-api/pom.xml b/pdfact-api/pom.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <groupId>pdfact</groupId>
+        <artifactId>pdfact-parent</artifactId>
+        <version>0.0.1-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>pdfact-api</artifactId>
+
+    <dependencies>
+        <!-- PdfAct modules this API builds on. Each artifact is declared exactly once:
+             Maven requires groupId:artifactId:type:classifier to be unique within a POM. -->
+        <dependency>
+            <groupId>pdfact</groupId>
+            <artifactId>pdfact-core</artifactId>
+            <version>0.0.1-SNAPSHOT</version>
+        </dependency>
+        <dependency>
+            <groupId>pdfact</groupId>
+            <artifactId>pdfact-cli</artifactId>
+            <version>0.0.1-SNAPSHOT</version>
+        </dependency>
+        <!-- Spark Java -->
+        <dependency>
+            <groupId>com.sparkjava</groupId>
+            <artifactId>spark-core</artifactId>
+            <version>2.9.3</version>
+        </dependency>
+        <!-- JSON Handling -->
+        <dependency>
+            <groupId>com.google.code.gson</groupId>
+            <artifactId>gson</artifactId>
+            <version>2.8.8</version>
+        </dependency>
+    </dependencies>
+
+    <properties>
+        <maven.compiler.source>17</maven.compiler.source>
+        <maven.compiler.target>17</maven.compiler.target>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>
diff --git a/pdfact-api/src/main/java/pdfact/api/PdfApi.java b/pdfact-api/src/main/java/pdfact/api/PdfApi.java
@@ -0,0 +1,60 @@
+package pdfact.api;
+
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+import com.google.gson.JsonObject;
+import pdfact.api.model.RequestPayload;
+import pdfact.core.util.exception.PdfActException;
+import spark.Request;
+import spark.Response;
+
+import java.io.IOException;
+
+import static spark.Spark.post;
+
+/**
+ * The API to parse a pdf file.
+ */
+public class PdfApi {
+
+    public static void main(String[] args) {
+        PdfService pdfService = new PdfService();
+        Gson gson = new GsonBuilder().disableHtmlEscaping().create();
+
+        post("/api/pdf/parse", (request, response) -> parsePdf(request, response, pdfService, gson), gson::toJson);
+    }
+
+    private static Object parsePdf(Request request, Response response, PdfService pdfService, Gson gson) {
+        String body = request.body();
+        RequestPayload requestPayload = gson.fromJson(body, RequestPayload.class);
+
+        if (requestPayload == null || requestPayload.getUrl() == null || requestPayload.getUrl().isEmpty()) {
+            response.status(400);
+            JsonObject errorResponse = new JsonObject();
+            errorResponse.addProperty("error", "File url is required");
+            return errorResponse;
+        }
+
+        JsonObject jsonResult;
+
+        try {
+            String jsonString = pdfService.parsePdf(requestPayload.getUrl(), requestPayload.getUnit(), requestPayload.getRoles());
+            jsonResult = gson.fromJson(jsonString, JsonObject.class);
+            response.status(200);
+        } catch (IllegalArgumentException e) {
+            response.status(422);
+            jsonResult = new JsonObject();
+            jsonResult.addProperty("error", "Illegal arguments. " + e.getMessage());
+        } catch (IOException e) {
+            response.status(400);
+            jsonResult = new JsonObject();
+            jsonResult.addProperty("error", "An error occurred while downloading the pdf file. " + e.getMessage());
+        } catch (PdfActException e) {
+            response.status(500);
+            jsonResult = new JsonObject();
+            jsonResult.addProperty("error", "An error occurred while processing the pdf file.");
+        }
+        return jsonResult;
+    }
+}
+
diff --git a/pdfact-api/src/main/java/pdfact/api/PdfService.java b/pdfact-api/src/main/java/pdfact/api/PdfService.java
@@ -0,0 +1,122 @@
+package pdfact.api;
+
+import pdfact.cli.PdfAct;
+import pdfact.cli.model.ExtractionUnit;
+import pdfact.cli.pipes.serialize.PdfJsonSerializer;
+import pdfact.core.model.Document;
+import pdfact.core.model.SemanticRole;
+import pdfact.core.util.exception.PdfActException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+
+/**
+ * The service to download a pdf file, run PdfAct on it and serialize the extracted text to json.
+ */
+public class PdfService {
+
+    /**
+     * Download a pdf file and create a json representation of its content.
+     *
+     * <p>The file is downloaded to a temporary file which is removed again before this method
+     * returns, whether parsing succeeded or not.
+     *
+     * @param fileUrl       The url to access the pdf file.
+     * @param unitSelected  The unit to split text on (e.g., paragraphs, words, characters, etc.);
+     *                      if null, paragraphs are serialized.
+     * @param rolesSelected The roles to extract (e.g., body, title, etc.); if null, all roles are
+     *                      serialized.
+     * @return A json representation of the extracted text.
+     * @throws IOException              If the file download/load goes wrong.
+     * @throws PdfActException          If the pdf processing or text extraction goes wrong.
+     * @throws IllegalArgumentException If wrong roles or units are passed by.
+     */
+    public String parsePdf(String fileUrl, String unitSelected, List<String> rolesSelected) throws IOException, PdfActException, IllegalArgumentException {
+        PdfAct pdfAct = new PdfAct();
+        Set<ExtractionUnit> unit;
+        Set<SemanticRole> roles;
+
+        // Validate the arguments before downloading anything, so bad input fails fast.
+        if (unitSelected != null) {
+            unit = getExtractionUnitSet(unitSelected);
+            pdfAct.setExtractionUnits(unit);
+        } else {
+            unit = new HashSet<>();
+            unit.add(ExtractionUnit.PARAGRAPH);
+        }
+        if (rolesSelected != null) {
+            roles = convertToSemanticRoles(rolesSelected);
+            pdfAct.setSemanticRoles(roles);
+        } else {
+            roles = new HashSet<>(Arrays.asList(SemanticRole.values()));
+        }
+
+        Path tempFile = downloadFileFromUrl(fileUrl);
+        try {
+            Document pdf = pdfAct.parse(tempFile.toString());
+            PdfJsonSerializer serializer = new PdfJsonSerializer(unit, roles);
+            byte[] serializedPdf = serializer.serialize(pdf);
+            return new String(serializedPdf, StandardCharsets.UTF_8);
+        } finally {
+            // The download is only needed while parsing; without this, every request leaks a file.
+            deleteQuietly(tempFile);
+        }
+    }
+
+    /**
+     * Download a pdf file into a fresh temporary file.
+     *
+     * <p>NOTE(review): the url comes straight from the request payload and is opened as is, so
+     * any scheme supported by {@link URL} (including {@code file:}) and any internal host is
+     * reachable. Consider restricting schemes/hosts if this API is exposed to untrusted clients.
+     *
+     * @param fileUrl The url to access the pdf file.
+     * @return The path to the downloaded pdf file; the caller is responsible for deleting it.
+     * @throws IOException If the url is malformed or the file download goes wrong.
+     */
+    private Path downloadFileFromUrl(String fileUrl) throws IOException {
+        // Parse the url first so that a malformed url does not leave a temp file behind.
+        URL url = new URL(fileUrl);
+        Path tempFile = Files.createTempFile("temp", ".pdf");
+        try (InputStream in = url.openStream()) {
+            Files.copy(in, tempFile, StandardCopyOption.REPLACE_EXISTING);
+        } catch (IOException | RuntimeException e) {
+            // The caller never sees the path on failure, so clean up here.
+            deleteQuietly(tempFile);
+            throw e;
+        }
+        return tempFile;
+    }
+
+    /**
+     * Delete the given file on a best-effort basis.
+     *
+     * @param file The file to delete.
+     */
+    private static void deleteQuietly(Path file) {
+        try {
+            Files.deleteIfExists(file);
+        } catch (IOException ignored) {
+            // A failed cleanup must not mask the parse result; retry when the JVM exits.
+            file.toFile().deleteOnExit();
+        }
+    }
+
+    /**
+     * Validate the given unit.
+     *
+     * @param unit The unit to split text on (e.g., paragraphs, words, characters, etc.).
+     * @return A set containing the validated unit.
+     * @throws IllegalArgumentException If the unit is null or not a known extraction unit.
+     */
+    public Set<ExtractionUnit> getExtractionUnitSet(String unit) throws IllegalArgumentException {
+        if (unit == null) {
+            throw new IllegalArgumentException("The extraction unit `" + unit + "` is not valid.");
+        }
+        Set<ExtractionUnit> unitSelected = new HashSet<>();
+        try {
+            // Locale.ROOT keeps the upper-casing independent of the server's default locale.
+            ExtractionUnit extractionUnit = ExtractionUnit.valueOf(unit.toUpperCase(java.util.Locale.ROOT));
+            unitSelected.add(extractionUnit);
+        } catch (IllegalArgumentException e) {
+            throw new IllegalArgumentException("The extraction unit `" + unit + "` is not valid.", e);
+        }
+        return unitSelected;
+    }
+
+    /**
+     * Validate the list of roles.
+     *
+     * @param rolesList The roles to extract (e.g., body, title, etc.).
+     * @return The validated set of roles.
+     * @throws IllegalArgumentException If an entry is null or not a known semantic role.
+     */
+    public Set<SemanticRole> convertToSemanticRoles(List<String> rolesList) throws IllegalArgumentException {
+        Set<SemanticRole> roles = new HashSet<>();
+        for (String role : rolesList) {
+            if (role == null) {
+                // A null entry (e.g. `"roles": [null]`) is invalid input, not a server error.
+                throw new IllegalArgumentException("The role `" + role + "` is not valid.");
+            }
+            try {
+                // Locale.ROOT keeps the upper-casing independent of the server's default locale.
+                SemanticRole semanticRole = SemanticRole.valueOf(role.toUpperCase(java.util.Locale.ROOT));
+                roles.add(semanticRole);
+            } catch (IllegalArgumentException e) {
+                throw new IllegalArgumentException("The role `" + role + "` is not valid.", e);
+            }
+        }
+        return roles;
+    }
+
+}
diff --git a/pdfact-api/src/main/java/pdfact/api/model/RequestPayload.java b/pdfact-api/src/main/java/pdfact/api/model/RequestPayload.java
@@ -0,0 +1,51 @@
+package pdfact.api.model;
+
+import java.util.List;
+
+/**
+ * The JSON payload expected by the parse endpoint: where to fetch the pdf from and what to
+ * extract from it.
+ */
+public class RequestPayload {
+    /** The url to access the pdf file. */
+    private String url;
+
+    /** The unit to split text on (e.g., paragraphs, words, characters, etc.). */
+    private String unit;
+
+    /** The roles to extract (e.g., body, title, etc.). */
+    private List<String> roles;
+
+    /**
+     * Returns the url to access the pdf file.
+     *
+     * @return The url, or null if none was set.
+     */
+    public String getUrl() {
+        return this.url;
+    }
+
+    /**
+     * Sets the url to access the pdf file.
+     *
+     * @param url The url to access the pdf file.
+     */
+    public void setUrl(String url) {
+        this.url = url;
+    }
+
+    /**
+     * Returns the unit to split text on.
+     *
+     * @return The unit, or null if none was set.
+     */
+    public String getUnit() {
+        return this.unit;
+    }
+
+    /**
+     * Sets the unit to split text on.
+     *
+     * @param unit The unit to split text on.
+     */
+    public void setUnit(String unit) {
+        this.unit = unit;
+    }
+
+    /**
+     * Returns the roles to extract.
+     *
+     * @return The roles, or null if none were set.
+     */
+    public List<String> getRoles() {
+        return this.roles;
+    }
+
+    /**
+     * Sets the roles to extract.
+     *
+     * @param roles The roles to extract.
+     */
+    public void setRoles(List<String> roles) {
+        this.roles = roles;
+    }
+}
diff --git a/pdfact-cli/pom.xml b/pdfact-cli/pom.xml
@@ -39,6 +39,11 @@
       <artifactId>json</artifactId>
       <version>20160810</version>
     </dependency>
+    <!-- Gson for JSON handling.
+         NOTE(review): pdfact-api declares gson 2.8.8 while this module pins 2.8.5;
+         consider aligning the two versions. -->
+    <dependency>
+      <groupId>com.google.code.gson</groupId>
+      <artifactId>gson</artifactId>
+      <version>2.8.5</version>
+    </dependency>
   </dependencies>
 
   <build>
@@ -105,6 +110,14 @@
           </execution>
         </executions>
       </plugin>
+      <!-- Compile this module with Java 11 as both source and target level.
+           NOTE(review): pdfact-api sets maven.compiler.source/target to 17; confirm the
+           difference between the modules is intended. -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <configuration>
+          <source>11</source>
+          <target>11</target>
+        </configuration>
+      </plugin>
     </plugins>
   </build>
 </project>
diff --git a/pdfact-core/src/main/java/pdfact/core/model/Paragraph.java b/pdfact-core/src/main/java/pdfact/core/model/Paragraph.java
@@ -2,6 +2,7 @@
 
 import java.util.List;
 
+import org.apache.commons.lang3.NotImplementedException;
 import org.apache.commons.lang3.builder.EqualsBuilder;
 import org.apache.commons.lang3.builder.HashCodeBuilder;
 
@@ -12,7 +13,7 @@
  * 
  * @author Claudius Korzen
  */
-public class Paragraph extends Element implements HasWords, HasText, HasPositions, HasSemanticRole {
+public class Paragraph extends Element implements HasWords, HasText, HasPositions, HasSemanticRole, HasColor, HasFontFace {
   /**
    * The words of this paragraph.
    */
@@ -194,7 +195,7 @@ public boolean equals(Object other) {
       builder.append(getPositions(), otherParagraph.getPositions());
       builder.append(getSemanticRole(), otherParagraph.getSemanticRole());
       builder.append(getSecondarySemanticRole(),
-          otherParagraph.getSecondarySemanticRole());
+              otherParagraph.getSecondarySemanticRole());
 
       return builder.isEquals();
     }
@@ -211,4 +212,24 @@ public int hashCode() {
     builder.append(getSecondarySemanticRole());
     return builder.hashCode();
   }
+
+  @Override
+  public Color getColor() {
+    return this.characterStatistic.colorFrequencies.getMostCommonObject();
+  }
+
+  @Override
+  public void setColor(Color color) {
+    throw new NotImplementedException("Paragraph does not allow to explicitly set the Color");
+  }
+
+  @Override
+  public FontFace getFontFace() {
+    return this.characterStatistic.fontFaceFrequencies.getMostCommonObject();
+  }
+
+  @Override
+  public void setFontFace(FontFace fontFace) {
+    throw new NotImplementedException("Paragraph does not allow to explicitly set the FontFace");
+  }
 }