Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add PDF with OCR samples #73

Merged
merged 2 commits into from
Jul 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions DotNET/Endpoint Examples/JSON Payload/pdf-with-ocr-text.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@

using Newtonsoft.Json.Linq;
using System.Text;

// Two-step pdfRest example: upload a local file to the "upload" endpoint, then POST the
// returned file id as a JSON payload to "pdf-with-ocr-text". Both response bodies are
// written to the console.
using (var httpClient = new HttpClient { BaseAddress = new Uri("https://api.pdfrest.com") })
{
    using (var uploadRequest = new HttpRequestMessage(HttpMethod.Post, "upload"))
    {
        uploadRequest.Headers.TryAddWithoutValidation("Api-Key", "xxxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx");
        uploadRequest.Headers.Accept.Add(new("application/json"));

        // The raw file bytes form the request body; the file name is sent in a content header.
        var uploadByteArray = File.ReadAllBytes("/path/to/file");
        var uploadByteAryContent = new ByteArrayContent(uploadByteArray);
        uploadByteAryContent.Headers.TryAddWithoutValidation("Content-Type", "application/octet-stream");
        uploadByteAryContent.Headers.TryAddWithoutValidation("Content-Filename", "filename.pdf");

        uploadRequest.Content = uploadByteAryContent;

        // Dispose the response once its body has been read.
        using var uploadResponse = await httpClient.SendAsync(uploadRequest);

        var uploadResult = await uploadResponse.Content.ReadAsStringAsync();

        Console.WriteLine("Upload response received.");
        Console.WriteLine(uploadResult);

        // SelectToken yields null instead of throwing when the path is absent, which is the
        // case when the upload response does not contain a "files" array with an id.
        JObject uploadResultJson = JObject.Parse(uploadResult);
        var uploadedID = uploadResultJson.SelectToken("files[0].id");

        if (uploadedID is null)
        {
            Console.WriteLine("Upload response did not contain a file id; skipping pdf-with-ocr-text request.");
        }
        else
        {
            using (var ocrTextRequest = new HttpRequestMessage(HttpMethod.Post, "pdf-with-ocr-text"))
            {
                ocrTextRequest.Headers.TryAddWithoutValidation("Api-Key", "xxxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx");
                ocrTextRequest.Headers.Accept.Add(new("application/json"));

                JObject parameterJson = new JObject
                {
                    ["id"] = uploadedID,
                };

                // Content-Type is a content header, so it is supplied by the StringContent
                // constructor here rather than being added to the request headers.
                ocrTextRequest.Content = new StringContent(parameterJson.ToString(), Encoding.UTF8, "application/json");

                using var ocrTextResponse = await httpClient.SendAsync(ocrTextRequest);

                var ocrTextResult = await ocrTextResponse.Content.ReadAsStringAsync();

                Console.WriteLine("Processing response received.");
                Console.WriteLine(ocrTextResult);
            }
        }
    }
}
28 changes: 28 additions & 0 deletions DotNET/Endpoint Examples/Multipart Payload/pdf-with-ocr-text.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
using System.Text;

// Sends a local file to the pdfRest "pdf-with-ocr-text" endpoint as multipart/form-data
// and writes the response body to the console.
using var client = new HttpClient { BaseAddress = new Uri("https://api.pdfrest.com") };
using var ocrRequest = new HttpRequestMessage(HttpMethod.Post, "pdf-with-ocr-text");

ocrRequest.Headers.TryAddWithoutValidation("Api-Key", "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx");
ocrRequest.Headers.Accept.Add(new("application/json"));

var form = new MultipartFormDataContent();

// "file" part: the document bytes, with its Content-Type set after it joins the form.
var filePart = new ByteArrayContent(File.ReadAllBytes("/path/to/file"));
form.Add(filePart, "file", "file_name");
filePart.Headers.TryAddWithoutValidation("Content-Type", "application/pdf");

// "output" part: a plain UTF-8 encoded form value.
var outputPart = new ByteArrayContent(Encoding.UTF8.GetBytes("converted"));
form.Add(outputPart, "output");

ocrRequest.Content = form;

var ocrResponse = await client.SendAsync(ocrRequest);
var responseBody = await ocrResponse.Content.ReadAsStringAsync();

Console.WriteLine("API response received.");
Console.WriteLine(responseBody);
96 changes: 96 additions & 0 deletions Java/Endpoint Examples/JSON Payload/PDFWithOCRText.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import io.github.cdimascio.dotenv.Dotenv;
import java.io.File;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
import okhttp3.*;
import org.json.JSONArray;
import org.json.JSONObject;

/**
 * Example program: uploads a file to pdfRest, then calls the pdf-with-ocr-text endpoint with a
 * JSON payload that references the uploaded file by id, and prints the result.
 */
public class PDFWithOCRText {

  // Specify the path to your file here, or as the first argument when running the program.
  private static final String DEFAULT_FILE_PATH = "/path/to/file.pdf";

  // Specify your API key here, or in the environment variable PDFREST_API_KEY.
  // You can also put the environment variable in a .env file.
  private static final String DEFAULT_API_KEY = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx";

  /**
   * Uploads the input file, then requests OCR processing for the uploaded id.
   *
   * @param args optional; {@code args[0]} overrides {@link #DEFAULT_FILE_PATH}
   */
  public static void main(String[] args) {
    File inputFile;
    if (args.length > 0) {
      inputFile = new File(args[0]);
    } else {
      inputFile = new File(DEFAULT_FILE_PATH);
    }
    final Dotenv dotenv = Dotenv.configure().ignoreIfMalformed().ignoreIfMissing().load();

    String uploadString = uploadFile(inputFile);
    // uploadFile returns "" when the response carried no body; that is not parseable JSON.
    if (uploadString.isEmpty()) {
      System.out.println("Error during upload: empty response body");
      return;
    }
    JSONObject uploadJSON = new JSONObject(uploadString);
    if (uploadJSON.has("error")) {
      System.out.println("Error during upload: " + uploadString);
      return;
    }
    JSONArray fileArray = uploadJSON.getJSONArray("files");

    JSONObject fileObject = fileArray.getJSONObject(0);

    String uploadedID = fileObject.get("id").toString();

    // Let the JSON library serialize (and escape) the payload instead of formatting it by hand.
    String JSONString = new JSONObject().put("id", uploadedID).toString();

    final RequestBody requestBody =
        RequestBody.create(JSONString, MediaType.parse("application/json"));

    Request request =
        new Request.Builder()
            .header("Api-Key", dotenv.get("PDFREST_API_KEY", DEFAULT_API_KEY))
            .url("https://api.pdfrest.com/pdf-with-ocr-text")
            .post(requestBody)
            .build();

    OkHttpClient client =
        new OkHttpClient().newBuilder().readTimeout(60, TimeUnit.SECONDS).build();

    // try-with-resources closes the response (and its body) even if printing fails.
    try (Response response = client.newCall(request).execute()) {
      System.out.println("Processing Result code " + response.code());
      if (response.body() != null) {
        System.out.println(prettyJson(response.body().string()));
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /** Re-serializes a JSON object string with a 4-space indent. */
  private static String prettyJson(String json) {
    // https://stackoverflow.com/a/9583835/11996393
    return new JSONObject(json).toString(4);
  }

  // This function is just a copy of the 'Upload.java' file to upload a binary file
  /**
   * Posts the file to the upload endpoint.
   *
   * @param inputFile file whose contents are sent as the request body
   * @return the response body as a string, or "" when the response has no body
   */
  private static String uploadFile(File inputFile) {

    final Dotenv dotenv = Dotenv.configure().ignoreIfMalformed().ignoreIfMissing().load();

    final RequestBody requestBody =
        RequestBody.create(inputFile, MediaType.parse("application/pdf"));

    Request request =
        new Request.Builder()
            .header("Api-Key", dotenv.get("PDFREST_API_KEY", DEFAULT_API_KEY))
            .header("Content-Filename", "File.pdf")
            .url("https://api.pdfrest.com/upload")
            .post(requestBody)
            .build();

    OkHttpClient client = new OkHttpClient().newBuilder().build();

    // try-with-resources closes the response once its body has been read into a string.
    try (Response response = client.newCall(request).execute()) {
      System.out.println("Upload Result code " + response.code());
      if (response.body() != null) {
        return response.body().string();
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    return "";
  }
}
63 changes: 63 additions & 0 deletions Java/Endpoint Examples/Multipart Payload/PDFWithOCRText.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import io.github.cdimascio.dotenv.Dotenv;
import java.io.File;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
import okhttp3.MediaType;
import okhttp3.MultipartBody;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.RequestBody;
import okhttp3.Response;
import org.json.JSONObject;

/**
 * Example program: sends a file to the pdfRest pdf-with-ocr-text endpoint as a
 * multipart/form-data request and prints the JSON result.
 */
public class PDFWithOCRText {

  // Specify the path to your file here, or as the first argument when running the program.
  private static final String DEFAULT_FILE_PATH = "/path/to/file.pdf";

  // Specify your API key here, or in the environment variable PDFREST_API_KEY.
  // You can also put the environment variable in a .env file.
  private static final String DEFAULT_API_KEY = "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx";

  /**
   * Builds the multipart request, executes it, and prints the status code and body.
   *
   * @param args optional; {@code args[0]} overrides {@link #DEFAULT_FILE_PATH}
   */
  public static void main(String[] args) {
    File inputFile;
    if (args.length > 0) {
      inputFile = new File(args[0]);
    } else {
      inputFile = new File(DEFAULT_FILE_PATH);
    }

    final Dotenv dotenv = Dotenv.configure().ignoreIfMalformed().ignoreIfMissing().load();

    final RequestBody inputFileRequestBody =
        RequestBody.create(inputFile, MediaType.parse("application/pdf"));
    RequestBody requestBody =
        new MultipartBody.Builder()
            .setType(MultipartBody.FORM)
            .addFormDataPart("file", inputFile.getName(), inputFileRequestBody)
            .addFormDataPart("output", "pdfrest_pdf-with-ocr-text")
            .build();
    Request request =
        new Request.Builder()
            .header("Api-Key", dotenv.get("PDFREST_API_KEY", DEFAULT_API_KEY))
            .url("https://api.pdfrest.com/pdf-with-ocr-text")
            .post(requestBody)
            .build();

    OkHttpClient client =
        new OkHttpClient().newBuilder().readTimeout(60, TimeUnit.SECONDS).build();

    // try-with-resources closes the response (and its body) even if printing fails.
    try (Response response = client.newCall(request).execute()) {
      System.out.println("Result code " + response.code());
      if (response.body() != null) {
        System.out.println(prettyJson(response.body().string()));
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /** Re-serializes a JSON object string with a 4-space indent. */
  private static String prettyJson(String json) {
    // https://stackoverflow.com/a/9583835/11996393
    return new JSONObject(json).toString(4);
  }
}
47 changes: 47 additions & 0 deletions JavaScript/Endpoint Examples/JSON Payload/pdf-with-ocr-text.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
var axios = require("axios");
var FormData = require("form-data");
var fs = require("fs");

// Step 1: stream the local file to the upload endpoint.
var file_stream = fs.createReadStream("/path/to/file");

var upload_request = {
  method: "post",
  maxBodyLength: Infinity,
  url: "https://api.pdfrest.com/upload",
  headers: {
    "Api-Key": "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", // Replace with your API key
    "Content-Filename": "filename.pdf",
    "Content-Type": "application/octet-stream",
  },
  data: file_stream, // request body is the file stream itself
};

// Step 2: once the upload resolves, pass the returned file id to pdf-with-ocr-text.
// Both requests share one promise chain, so a failure in either is logged by the same handler.
axios(upload_request)
  .then(function (upload_response) {
    console.log(JSON.stringify(upload_response.data));
    var file_id = upload_response.data.files[0].id;

    return axios({
      method: "post",
      maxBodyLength: Infinity,
      url: "https://api.pdfrest.com/pdf-with-ocr-text",
      headers: {
        "Api-Key": "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", // Replace with your API key
        "Content-Type": "application/json",
      },
      data: { id: file_id }, // JSON payload referencing the uploaded file
    });
  })
  .then(function (ocr_response) {
    console.log(JSON.stringify(ocr_response.data));
  })
  .catch(function (error) {
    console.log(error);
  });
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// This request demonstrates how to apply OCR to a PDF document and insert text behind images of text.
var axios = require('axios');
var FormData = require('form-data');
var fs = require('fs');

// Assemble the multipart form: the file stream plus the 'output' parameter.
var form = new FormData();
form.append('file', fs.createReadStream('/path/to/file'));
form.append('output', 'pdfrest_pdf-with-ocr-text');

// Merge the API key with the headers generated by the form (multipart boundary etc.).
var request_headers = Object.assign(
  { 'Api-Key': 'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx' }, // Replace with your API key
  form.getHeaders()
);

// Send the request, then print the response data or the error.
axios({
  method: 'post',
  maxBodyLength: Infinity, // do not cap the size of the request body
  url: 'https://api.pdfrest.com/pdf-with-ocr-text',
  headers: request_headers,
  data: form
})
  .then(function (response) {
    console.log(JSON.stringify(response.data));
  })
  .catch(function (error) {
    console.log(error);
  });

// If you would like to download the file instead of getting the JSON response, please see the 'get-resource-id-endpoint.js' sample.
33 changes: 33 additions & 0 deletions PHP/Endpoint Examples/JSON Payload/pdf-with-ocr-text.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
<?php
require 'vendor/autoload.php'; // Require the autoload file to load Guzzle HTTP client.

use GuzzleHttp\Client; // Import the Guzzle HTTP client namespace.
use GuzzleHttp\Psr7\Request; // Import the PSR-7 Request class.
use GuzzleHttp\Psr7\Utils; // Import the PSR-7 Utils class for working with streams.

// Step 1: upload the file. 'http_errors' => false makes Guzzle return 4xx/5xx responses
// instead of throwing, so the body can be printed and inspected either way.
$upload_client = new Client(['http_errors' => false]);
$upload_headers = [
  'api-key' => 'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx',
  'content-filename' => 'filename.pdf',
  'Content-Type' => 'application/octet-stream'
];
$upload_body = file_get_contents('/path/to/file');
$upload_request = new Request('POST', 'https://api.pdfrest.com/upload', $upload_headers, $upload_body);
$upload_res = $upload_client->sendAsync($upload_request)->wait();
echo $upload_res->getBody() . PHP_EOL;

$upload_response_json = json_decode((string) $upload_res->getBody());

// Stop here when the response does not carry an uploaded file id (for example an error
// response); continuing would send a request with an empty id.
if (!isset($upload_response_json->files[0]->id)) {
  echo "Upload did not return a file id; aborting." . PHP_EOL;
  exit(1);
}

$uploaded_id = $upload_response_json->{'files'}[0]->{'id'};

echo "Successfully uploaded with an id of: " . $uploaded_id . PHP_EOL;

// Step 2: reference the uploaded file by id in a JSON payload.
$pdf_with_ocr_text_client = new Client(['http_errors' => false]);
$pdf_with_ocr_text_headers = [
  'api-key' => 'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx',
  'Content-Type' => 'application/json'
];
// json_encode handles quoting/escaping of the id rather than concatenating it into a string.
$pdf_with_ocr_text_body = json_encode(['id' => $uploaded_id]);
$pdf_with_ocr_text_request = new Request('POST', 'https://api.pdfrest.com/pdf-with-ocr-text', $pdf_with_ocr_text_headers, $pdf_with_ocr_text_body);
$pdf_with_ocr_text_res = $pdf_with_ocr_text_client->sendAsync($pdf_with_ocr_text_request)->wait();
echo $pdf_with_ocr_text_res->getBody() . PHP_EOL;
35 changes: 35 additions & 0 deletions PHP/Endpoint Examples/Multipart Payload/pdf-with-ocr-text.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
<?php
require 'vendor/autoload.php'; // Require the autoload file to load Guzzle HTTP client.

use GuzzleHttp\Client; // Import the Guzzle HTTP client namespace.
use GuzzleHttp\Psr7\Request; // Import the PSR-7 Request class.
use GuzzleHttp\Psr7\Utils; // Import the PSR-7 Utils class for working with streams.

// Guzzle HTTP client used to send the request.
$client = new Client();

// Multipart part carrying the document to process.
$file_part = [
  'name' => 'file',
  'contents' => Utils::tryFopen('/path/to/file', 'r'), // Read-only handle on the input file.
  'filename' => '/path/to/file',
  'headers' => [
    // Placeholder value; presumably meant to be replaced with the file's actual MIME type.
    'Content-Type' => '<Content-type header>'
  ]
];

// Multipart part carrying the 'output' option.
$output_part = [
  'name' => 'output',
  'contents' => 'pdfrest_pdf-with-ocr-text'
];

// POST request to the endpoint; the API key is sent as a header.
$request = new Request(
  'POST',
  'https://api.pdfrest.com/pdf-with-ocr-text',
  ['Api-Key' => 'xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx']
);

// Send asynchronously with both parts attached and block until the response arrives.
$res = $client->sendAsync($request, ['multipart' => [$file_part, $output_part]])->wait();

// Print the response body.
echo $res->getBody();
Loading
Loading