Skip to content

Commit

Permalink
Add unstructured client
Browse files Browse the repository at this point in the history
  • Loading branch information
hupe1980 committed Nov 27, 2023
1 parent fb3f434 commit 0e38862
Show file tree
Hide file tree
Showing 2 changed files with 221 additions and 0 deletions.
134 changes: 134 additions & 0 deletions integration/unstructured.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
package integration

import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"mime/multipart"
"net/http"
"os"
)

// UnstructuredOptions holds the configurable settings of an Unstructured
// client. NewUnstructured fills in defaults and then lets caller-supplied
// option functions override individual fields.
type UnstructuredOptions struct {
// BaseURL is the endpoint that partition requests are POSTed to.
// NewUnstructured defaults it to the hosted Unstructured API.
BaseURL string

// HTTPClient performs the outgoing HTTP requests. NewUnstructured defaults
// it to http.DefaultClient; replace it to customise how requests are sent
// or to inject a test double.
HTTPClient HTTPClient
}

// Unstructured is a client for interacting with the Unstructured API.
// Construct it with NewUnstructured so that the options carry usable defaults.
type Unstructured struct {
// apiKey is sent with every request in the "unstructured-api-key" header.
apiKey string
// opts holds the resolved endpoint and HTTP client settings.
opts UnstructuredOptions
}

// NewUnstructured builds an Unstructured client that authenticates with
// apiKey.
//
// The client starts from the hosted API endpoint and http.DefaultClient; each
// function in optFns is then invoked, in order, with a pointer to the options
// so that it can override those defaults.
func NewUnstructured(apiKey string, optFns ...func(o *UnstructuredOptions)) *Unstructured {
	var cfg UnstructuredOptions
	cfg.BaseURL = "https://api.unstructured.io/general/v0/general"
	cfg.HTTPClient = http.DefaultClient

	// Apply caller overrides on top of the defaults, first to last.
	for i := 0; i < len(optFns); i++ {
		optFns[i](&cfg)
	}

	client := new(Unstructured)
	client.apiKey = apiKey
	client.opts = cfg

	return client
}

// PartitionInput represents the input for the Partition method.
type PartitionInput struct {
// File is the document to upload. Its contents are copied into the request
// starting at the file's current read offset, and its Name() is sent as the
// multipart filename. Partition does not close it.
File *os.File
}

// PartitionOutput is a single element of the JSON array returned by the
// Unstructured API; Partition decodes the response into a slice of these.
type PartitionOutput struct {
// Type is the element's "type" value (for example "NarrativeText").
Type string `json:"type"`
// ElementID is the element's "element_id" value.
ElementID string `json:"element_id"`
// Metadata mirrors the element's "metadata" object.
Metadata struct {
// Filetype is the "filetype" value (for example "application/pdf").
Filetype string `json:"filetype"`
// Languages is the "languages" list (for example ["eng"]).
Languages []string `json:"languages"`
// PageNumber is the "page_number" value; it stays 0 when the field is absent.
PageNumber int `json:"page_number"`
// Filename is the "filename" value reported by the API.
Filename string `json:"filename"`
} `json:"metadata"`
// Text is the element's "text" value.
Text string `json:"text"`
}

// Partition uploads input.File to the configured Unstructured endpoint and
// returns the elements the API extracted from it.
//
// The request always uses the "hi_res" strategy with page breaks included.
// An error is returned when input or input.File is nil, when the request
// fails or is answered with a non-200 status, or when the response body
// cannot be decoded as a JSON array of elements.
func (c *Unstructured) Partition(ctx context.Context, input *PartitionInput) ([]PartitionOutput, error) {
	// Reject nil up front: dereferencing either value below would panic.
	if input == nil || input.File == nil {
		return nil, fmt.Errorf("unstructured: partition input file must not be nil")
	}

	fields := map[string]string{
		"strategy":            "hi_res",
		"include_page_breaks": "true",
	}

	res, err := c.doMultipartRequest(ctx, c.opts.BaseURL, input.File, fields)
	if err != nil {
		return nil, err
	}

	// Start from an empty (non-nil) slice so callers get [] rather than nil
	// when the API reports no elements.
	output := []PartitionOutput{}
	if err := json.Unmarshal(res, &output); err != nil {
		return nil, fmt.Errorf("decoding unstructured response: %w", err)
	}

	return output, nil
}

// doMultipartRequest POSTs file and the given form fields to url as a
// multipart/form-data request and returns the raw response body.
//
// The file is sent under the "files" form field, named after file.Name(), and
// the whole request body is buffered in memory before the request is issued.
// The client's API key is sent in the "unstructured-api-key" header.
//
// An error is returned when file is nil, when the multipart body cannot be
// assembled, when the HTTP call fails, or when the response status is not
// 200 OK. In the last case the error includes a bounded excerpt of the
// response body, if any, so that the API's own error detail is not lost.
func (c *Unstructured) doMultipartRequest(ctx context.Context, url string, file *os.File, fields map[string]string) ([]byte, error) {
	// maxErrorDetail caps how many response-body bytes are echoed into a
	// status error so that a large error page cannot bloat the message.
	const maxErrorDetail = 1024

	// file.Name() and io.Copy below would panic on a nil *os.File.
	if file == nil {
		return nil, fmt.Errorf("unstructured: file must not be nil")
	}

	var b bytes.Buffer

	w := multipart.NewWriter(&b)

	fw, err := w.CreateFormFile("files", file.Name())
	if err != nil {
		return nil, err
	}

	if _, cErr := io.Copy(fw, file); cErr != nil {
		return nil, cErr
	}

	for k, v := range fields {
		if wErr := w.WriteField(k, v); wErr != nil {
			return nil, wErr
		}
	}

	// Close writes the trailing boundary. Without it the body is malformed,
	// so a failure here must be reported rather than ignored.
	if cErr := w.Close(); cErr != nil {
		return nil, cErr
	}

	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, &b)
	if err != nil {
		return nil, err
	}

	httpReq.Header.Set("Accept", "application/json")
	httpReq.Header.Set("Content-Type", w.FormDataContentType())
	httpReq.Header.Set("unstructured-api-key", c.apiKey)

	res, err := c.opts.HTTPClient.Do(httpReq)
	if err != nil {
		return nil, err
	}

	defer res.Body.Close()

	resBody, readErr := io.ReadAll(res.Body)

	// Report a bad status before a read error: the status is the more useful
	// diagnosis, and whatever part of the body was read is attached to it.
	if res.StatusCode != http.StatusOK {
		detail := bytes.TrimSpace(resBody)
		if len(detail) > maxErrorDetail {
			detail = detail[:maxErrorDetail]
		}

		if len(detail) == 0 {
			return nil, fmt.Errorf("unstructured API returned unexpected status code: %d", res.StatusCode)
		}

		return nil, fmt.Errorf("unstructured API returned unexpected status code: %d: %s", res.StatusCode, detail)
	}

	if readErr != nil {
		return nil, readErr
	}

	return resBody, nil
}
87 changes: 87 additions & 0 deletions integration/unstructured_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package integration

import (
"bytes"
"context"
"errors"
"io"
"net/http"
"os"
"testing"

"github.com/stretchr/testify/assert"
)

// TestUnstructuredPartition verifies that Partition issues a multipart POST
// carrying the expected Accept, Content-Type and API-key headers, and that a
// successful JSON response is decoded into PartitionOutput values.
func TestUnstructuredPartition(t *testing.T) {
	// Create a temporary file to upload. Abort immediately on failure: file
	// would be nil and every later use of it would panic.
	file, err := os.CreateTemp("", "testfile")
	if err != nil {
		t.Fatalf("creating temp file: %v", err)
	}

	// Close the handle before removing the file so that no descriptor is
	// leaked by the test.
	defer func() {
		assert.NoError(t, file.Close())
		assert.NoError(t, os.Remove(file.Name()))
	}()

	// Create a sample response JSON
	responseJSON := `[{"type":"NarrativeText","element_id":"mock_element_id","metadata":{"filetype":"application/pdf","languages":["eng"],"page_number":1,"filename":"testfile.pdf"},"text":"Mock text"}]`

	// Create a mock HTTP client that checks the request and returns the
	// predefined response. The variable is deliberately not named after the
	// mock type, which would shadow it for the rest of the function.
	httpMock := &mockUnstructuredHTTPClient{
		DoFunc: func(req *http.Request) (*http.Response, error) {
			assert.Equal(t, http.MethodPost, req.Method)
			assert.Equal(t, "application/json", req.Header.Get("Accept"))
			assert.Contains(t, req.Header.Get("Content-Type"), "multipart/form-data")
			assert.Equal(t, "mock_api_key", req.Header.Get("unstructured-api-key"))

			// Simulate a successful response
			return &http.Response{
				StatusCode: http.StatusOK,
				Body:       io.NopCloser(bytes.NewBufferString(responseJSON)),
			}, nil
		},
	}

	// Create an instance of Unstructured with the mock HTTP client
	unstructuredClient := NewUnstructured("mock_api_key", func(o *UnstructuredOptions) {
		o.HTTPClient = httpMock
	})

	t.Run("Partition", func(t *testing.T) {
		// Call the Partition method with the temporary file
		output, err := unstructuredClient.Partition(context.Background(), &PartitionInput{File: file})
		assert.NoError(t, err)

		// The Metadata literal repeats the anonymous struct type (tags
		// included) so that it is identical to PartitionOutput.Metadata.
		expectedOutput := []PartitionOutput{
			{
				Type:      "NarrativeText",
				ElementID: "mock_element_id",
				Metadata: struct {
					Filetype   string   `json:"filetype"`
					Languages  []string `json:"languages"`
					PageNumber int      `json:"page_number"`
					Filename   string   `json:"filename"`
				}{
					Filetype:   "application/pdf",
					Languages:  []string{"eng"},
					PageNumber: 1,
					Filename:   "testfile.pdf",
				},
				Text: "Mock text",
			},
		}
		assert.Equal(t, expectedOutput, output)
	})
}

// mockUnstructuredHTTPClient is a hand-written test double for the HTTP
// client used by Unstructured; every request is delegated to DoFunc.
type mockUnstructuredHTTPClient struct {
	DoFunc func(req *http.Request) (*http.Response, error)
}

// Do forwards req to DoFunc and returns its result unchanged. When no DoFunc
// has been configured it returns an error instead of calling a nil function.
func (m *mockUnstructuredHTTPClient) Do(req *http.Request) (*http.Response, error) {
	handler := m.DoFunc
	if handler == nil {
		return nil, errors.New("mockUnstructuredHTTPClient: DoFunc not set")
	}

	return handler(req)
}

0 comments on commit 0e38862

Please sign in to comment.