From 0e38862a5e7aa14532c28b0cc89b2783caeb3ee5 Mon Sep 17 00:00:00 2001
From: hupe1980
Date: Mon, 27 Nov 2023 20:03:27 +0100
Subject: [PATCH] Add unstructured client

---
 Review notes (text in this section is not part of the commit message):
  * Partition rejects a nil input or nil File instead of panicking.
  * doMultipartRequest no longer ignores the multipart writer's Close error
    and sends only the base name of the uploaded file.
  * The test closes its temp file, aborts if it cannot be created, and
    covers the nil-file case.
  * The blob ids on the "index" lines predate these revisions.

 integration/unstructured.go      | 151 +++++++++++++++++++++++++++++++
 integration/unstructured_test.go |  92 +++++++++++++++++++
 2 files changed, 243 insertions(+)
 create mode 100644 integration/unstructured.go
 create mode 100644 integration/unstructured_test.go

diff --git a/integration/unstructured.go b/integration/unstructured.go
new file mode 100644
index 0000000..83d9155
--- /dev/null
+++ b/integration/unstructured.go
@@ -0,0 +1,151 @@
+package integration
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io"
+	"mime/multipart"
+	"net/http"
+	"os"
+	"path/filepath"
+)
+
+// UnstructuredOptions represents options for configuring the Unstructured client.
+type UnstructuredOptions struct {
+	// BaseURL is the base URL of the Unstructured API.
+	BaseURL string
+
+	// HTTPClient is the HTTP client to use for making API requests.
+	HTTPClient HTTPClient
+}
+
+// Unstructured is a client for interacting with the Unstructured API.
+type Unstructured struct {
+	apiKey string
+	opts   UnstructuredOptions
+}
+
+// NewUnstructured creates a new Unstructured client with the provided API key.
+// By default it targets https://api.unstructured.io/general/v0/general using
+// http.DefaultClient; optFns are applied in order and may override either.
+func NewUnstructured(apiKey string, optFns ...func(o *UnstructuredOptions)) *Unstructured {
+	opts := UnstructuredOptions{
+		BaseURL:    "https://api.unstructured.io/general/v0/general",
+		HTTPClient: http.DefaultClient,
+	}
+
+	for _, fn := range optFns {
+		fn(&opts)
+	}
+
+	return &Unstructured{
+		apiKey: apiKey,
+		opts:   opts,
+	}
+}
+
+// PartitionInput represents the input for the Partition method.
+type PartitionInput struct {
+	// File is the file to upload. It is read from its current offset and is
+	// not closed by Partition.
+	File *os.File
+}
+
+// PartitionOutput represents the output of the Partition method.
+type PartitionOutput struct {
+	Type      string `json:"type"`
+	ElementID string `json:"element_id"`
+	Metadata  struct {
+		Filetype   string   `json:"filetype"`
+		Languages  []string `json:"languages"`
+		PageNumber int      `json:"page_number"`
+		Filename   string   `json:"filename"`
+	} `json:"metadata"`
+	Text string `json:"text"`
+}
+
+// Partition sends a file to the Unstructured API for partitioning and returns the partitioned content.
+// It returns an error, rather than panicking, when input or input.File is nil.
+func (c *Unstructured) Partition(ctx context.Context, input *PartitionInput) ([]PartitionOutput, error) {
+	if input == nil || input.File == nil {
+		return nil, errors.New("unstructured: partition input file must not be nil")
+	}
+
+	fields := map[string]string{
+		"strategy":            "hi_res",
+		"include_page_breaks": "true",
+	}
+
+	res, err := c.doMultipartRequest(ctx, c.opts.BaseURL, input.File, fields)
+	if err != nil {
+		return nil, err
+	}
+
+	output := []PartitionOutput{}
+	if err := json.Unmarshal(res, &output); err != nil {
+		return nil, err
+	}
+
+	return output, nil
+}
+
+// doMultipartRequest posts file and the given form fields to url as a
+// multipart/form-data request and returns the raw response body. Any status
+// other than 200 OK is returned as an error.
+func (c *Unstructured) doMultipartRequest(ctx context.Context, url string, file *os.File, fields map[string]string) ([]byte, error) {
+	var b bytes.Buffer
+
+	w := multipart.NewWriter(&b)
+
+	// Send only the base name so local directory names are not disclosed to the API.
+	fw, err := w.CreateFormFile("files", filepath.Base(file.Name()))
+	if err != nil {
+		return nil, err
+	}
+
+	if _, cErr := io.Copy(fw, file); cErr != nil {
+		return nil, cErr
+	}
+
+	for k, v := range fields {
+		if wErr := w.WriteField(k, v); wErr != nil {
+			return nil, wErr
+		}
+	}
+
+	// Close finishes the multipart message and writes the trailing boundary end line to the output.
+	// A failure here leaves the body incomplete, so it must not be ignored.
+	if cErr := w.Close(); cErr != nil {
+		return nil, cErr
+	}
+
+	httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, &b)
+	if err != nil {
+		return nil, err
+	}
+
+	httpReq.Header.Set("Accept", "application/json")
+	httpReq.Header.Set("Content-Type", w.FormDataContentType())
+	httpReq.Header.Set("unstructured-api-key", c.apiKey)
+
+	res, err := c.opts.HTTPClient.Do(httpReq)
+	if err != nil {
+		return nil, err
+	}
+
+	defer res.Body.Close()
+
+	resBody, err := io.ReadAll(res.Body)
+	if err != nil {
+		return nil, err
+	}
+
+	if res.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("unstructured API returned unexpected status code: %d", res.StatusCode)
+	}
+
+	return resBody, nil
+}
diff --git a/integration/unstructured_test.go b/integration/unstructured_test.go
new file mode 100644
index 0000000..f0f04c6
--- /dev/null
+++ b/integration/unstructured_test.go
@@ -0,0 +1,92 @@
+package integration
+
+import (
+	"bytes"
+	"context"
+	"errors"
+	"io"
+	"net/http"
+	"os"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestUnstructuredPartition(t *testing.T) {
+	// Create a temporary file for testing. Abort on failure: file would be nil
+	// and every later use of it would panic.
+	file, err := os.CreateTemp("", "testfile")
+	if err != nil {
+		t.Fatalf("creating temp file: %v", err)
+	}
+
+	defer func() {
+		// Best-effort cleanup: close the handle before removing the file.
+		_ = file.Close()
+		_ = os.Remove(file.Name())
+	}()
+
+	// Create a sample response JSON
+	responseJSON := `[{"type":"NarrativeText","element_id":"mock_element_id","metadata":{"filetype":"application/pdf","languages":["eng"],"page_number":1,"filename":"testfile.pdf"},"text":"Mock text"}]`
+
+	// Create a mock HTTP client with a predefined response
+	httpClient := &mockUnstructuredHTTPClient{
+		DoFunc: func(req *http.Request) (*http.Response, error) {
+			assert.Equal(t, http.MethodPost, req.Method)
+			assert.Equal(t, "application/json", req.Header.Get("Accept"))
+			assert.Contains(t, req.Header.Get("Content-Type"), "multipart/form-data")
+			assert.Equal(t, "mock_api_key", req.Header.Get("unstructured-api-key"))
+
+			// Simulate a successful response
+			return &http.Response{
+				StatusCode: http.StatusOK,
+				Body:       io.NopCloser(bytes.NewBufferString(responseJSON)),
+			}, nil
+		},
+	}
+
+	// Create an instance of Unstructured with the mock HTTP client
+	unstructuredClient := NewUnstructured("mock_api_key", func(o *UnstructuredOptions) {
+		o.HTTPClient = httpClient
+	})
+
+	t.Run("Partition", func(t *testing.T) {
+		// Call the Partition method with the mock file
+		output, err := unstructuredClient.Partition(context.Background(), &PartitionInput{File: file})
+		assert.NoError(t, err)
+
+		// Assert the expected output
+		expected := PartitionOutput{
+			Type:      "NarrativeText",
+			ElementID: "mock_element_id",
+			Text:      "Mock text",
+		}
+		expected.Metadata.Filetype = "application/pdf"
+		expected.Metadata.Languages = []string{"eng"}
+		expected.Metadata.PageNumber = 1
+		expected.Metadata.Filename = "testfile.pdf"
+
+		assert.Equal(t, []PartitionOutput{expected}, output)
+	})
+
+	t.Run("MissingFile", func(t *testing.T) {
+		// A missing file must surface as an error instead of a nil dereference.
+		output, err := unstructuredClient.Partition(context.Background(), &PartitionInput{})
+		assert.Error(t, err)
+		assert.Nil(t, output)
+	})
+}
+
+// mockUnstructuredHTTPClient is a custom mock for the HTTP client.
+type mockUnstructuredHTTPClient struct {
+	DoFunc func(req *http.Request) (*http.Response, error)
+}
+
+// Do is the implementation of the Do method for the mock.
+func (m *mockUnstructuredHTTPClient) Do(req *http.Request) (*http.Response, error) {
+	if m.DoFunc != nil {
+		return m.DoFunc(req)
+	}
+
+	return nil, errors.New("mockUnstructuredHTTPClient: DoFunc not set")
+}