-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
221 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,134 @@ | ||
package integration | ||
|
||
import ( | ||
"bytes" | ||
"context" | ||
"encoding/json" | ||
"fmt" | ||
"io" | ||
"mime/multipart" | ||
"net/http" | ||
"os" | ||
) | ||
|
||
// UnstructuredOptions represents options for configuring the Unstructured client. | ||
type UnstructuredOptions struct { | ||
// BaseURL is the base URL of the Unstructured API. | ||
BaseURL string | ||
|
||
// HTTPClient is the HTTP client to use for making API requests. | ||
HTTPClient HTTPClient | ||
} | ||
|
||
// Unstructured is a client for interacting with the Unstructured API. | ||
type Unstructured struct { | ||
apiKey string | ||
opts UnstructuredOptions | ||
} | ||
|
||
// NewUnstructured creates a new Unstructured client with the provided API key. | ||
func NewUnstructured(apiKey string, optFns ...func(o *UnstructuredOptions)) *Unstructured { | ||
opts := UnstructuredOptions{ | ||
BaseURL: "https://api.unstructured.io/general/v0/general", | ||
HTTPClient: http.DefaultClient, | ||
} | ||
|
||
for _, fn := range optFns { | ||
fn(&opts) | ||
} | ||
|
||
return &Unstructured{ | ||
apiKey: apiKey, | ||
opts: opts, | ||
} | ||
} | ||
|
||
// PartitionInput carries the arguments of Unstructured.Partition.
type PartitionInput struct {
	// File is the document to upload; its contents are read from the
	// current offset and its name is sent as the upload filename.
	File *os.File
}
|
||
// PartitionOutput is a single element of the JSON array returned by the
// partition endpoint, as decoded by Unstructured.Partition.
type PartitionOutput struct {
	// Type is decoded from the "type" key (for example "NarrativeText").
	Type string `json:"type"`
	// ElementID is decoded from the "element_id" key.
	ElementID string `json:"element_id"`
	// Metadata is decoded from the nested "metadata" object.
	Metadata struct {
		Filetype   string   `json:"filetype"`
		Languages  []string `json:"languages"`
		PageNumber int      `json:"page_number"`
		Filename   string   `json:"filename"`
	} `json:"metadata"`
	// Text is decoded from the "text" key.
	Text string `json:"text"`
}
|
||
// Partition sends a file to the Unstructured API for partitioning and returns the partitioned content. | ||
func (c *Unstructured) Partition(ctx context.Context, input *PartitionInput) ([]PartitionOutput, error) { | ||
fields := map[string]string{ | ||
"strategy": "hi_res", | ||
"include_page_breaks": "true", | ||
} | ||
|
||
res, err := c.doMultipartRequest(ctx, c.opts.BaseURL, input.File, fields) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
output := []PartitionOutput{} | ||
if err := json.Unmarshal(res, &output); err != nil { | ||
return nil, err | ||
} | ||
|
||
return output, nil | ||
} | ||
|
||
// doMultipartRequest performs a multipart request to the Unstructured API. | ||
func (c *Unstructured) doMultipartRequest(ctx context.Context, url string, file *os.File, fields map[string]string) ([]byte, error) { | ||
var b bytes.Buffer | ||
|
||
w := multipart.NewWriter(&b) | ||
|
||
fw, err := w.CreateFormFile("files", file.Name()) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
if _, cErr := io.Copy(fw, file); cErr != nil { | ||
return nil, cErr | ||
} | ||
|
||
for k, v := range fields { | ||
if wErr := w.WriteField(k, v); wErr != nil { | ||
return nil, wErr | ||
} | ||
} | ||
|
||
// Close finishes the multipart message and writes the trailing boundary end line to the output. | ||
w.Close() | ||
|
||
httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, &b) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
httpReq.Header.Set("Accept", "application/json") | ||
httpReq.Header.Set("Content-Type", w.FormDataContentType()) | ||
httpReq.Header.Set("unstructured-api-key", c.apiKey) | ||
|
||
res, err := c.opts.HTTPClient.Do(httpReq) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
defer res.Body.Close() | ||
|
||
resBody, err := io.ReadAll(res.Body) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
if res.StatusCode != http.StatusOK { | ||
return nil, fmt.Errorf("unstructured API returned unexpected status code: %d", res.StatusCode) | ||
} | ||
|
||
return resBody, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
package integration | ||
|
||
import ( | ||
"bytes" | ||
"context" | ||
"errors" | ||
"io" | ||
"net/http" | ||
"os" | ||
"testing" | ||
|
||
"github.com/stretchr/testify/assert" | ||
) | ||
|
||
func TestUnstructuredPartition(t *testing.T) { | ||
// Create a temporary file for testing | ||
file, err := os.CreateTemp("", "testfile") | ||
assert.NoError(t, err) | ||
|
||
defer os.Remove(file.Name()) | ||
|
||
// Create a sample response JSON | ||
responseJSON := `[{"type":"NarrativeText","element_id":"mock_element_id","metadata":{"filetype":"application/pdf","languages":["eng"],"page_number":1,"filename":"testfile.pdf"},"text":"Mock text"}]` | ||
|
||
// Create a mock HTTP client with a predefined response | ||
mockUnstructuredHTTPClient := &mockUnstructuredHTTPClient{ | ||
DoFunc: func(req *http.Request) (*http.Response, error) { | ||
assert.Equal(t, http.MethodPost, req.Method) | ||
assert.Equal(t, "application/json", req.Header.Get("Accept")) | ||
assert.Contains(t, req.Header.Get("Content-Type"), "multipart/form-data") | ||
assert.Equal(t, "mock_api_key", req.Header.Get("unstructured-api-key")) | ||
|
||
// Simulate a successful response | ||
return &http.Response{ | ||
StatusCode: http.StatusOK, | ||
Body: io.NopCloser(bytes.NewBufferString(responseJSON)), | ||
}, nil | ||
}, | ||
} | ||
|
||
// Create an instance of Unstructured with the mock HTTP client | ||
unstructuredClient := NewUnstructured("mock_api_key", func(o *UnstructuredOptions) { | ||
o.HTTPClient = mockUnstructuredHTTPClient | ||
}) | ||
|
||
// Create a test case | ||
t.Run("Partition", func(t *testing.T) { | ||
// Call the Partition method with the mock file | ||
output, err := unstructuredClient.Partition(context.Background(), &PartitionInput{File: file}) | ||
assert.NoError(t, err) | ||
|
||
// Assert the expected output | ||
expectedOutput := []PartitionOutput{ | ||
{ | ||
Type: "NarrativeText", | ||
ElementID: "mock_element_id", | ||
Metadata: struct { | ||
Filetype string `json:"filetype"` | ||
Languages []string `json:"languages"` | ||
PageNumber int `json:"page_number"` | ||
Filename string `json:"filename"` | ||
}{ | ||
Filetype: "application/pdf", | ||
Languages: []string{"eng"}, | ||
PageNumber: 1, | ||
Filename: "testfile.pdf", | ||
}, | ||
Text: "Mock text", | ||
}, | ||
} | ||
assert.Equal(t, expectedOutput, output) | ||
}) | ||
} | ||
|
||
// mockUnstructuredHTTPClient is a test double for the HTTP client used by
// Unstructured; its behaviour is supplied per test through DoFunc.
type mockUnstructuredHTTPClient struct {
	// DoFunc, when non-nil, handles every call to Do.
	DoFunc func(req *http.Request) (*http.Response, error)
}

// Do forwards req to DoFunc and returns its result. It reports an error
// when no DoFunc has been configured.
func (m *mockUnstructuredHTTPClient) Do(req *http.Request) (*http.Response, error) {
	if m.DoFunc == nil {
		return nil, errors.New("mockUnstructuredHTTPClient: DoFunc not set")
	}

	return m.DoFunc(req)
}