Skip to content

Commit

Permalink
Add article summary feature with OpenAI integration
Browse files Browse the repository at this point in the history
- Introduce 'summary' query parameter in /api/content/v1/parser endpoint
- Integrate OpenAI API for generating article summaries
- Add OpenAIKey field to Server struct and corresponding command-line
flag
- Update extractArticleEmulateReadability to handle summary requests
- Add generateSummary method using OpenAI's GPT-4o model (turns out to
be faster than even 4o mini)
- Add OpenAIClient interface and mock for testing
- Update README.md with new configuration options and API details

This feature allows users to request a summary of extracted articles
using OpenAI's GPT-4o model. To ensure secure usage, summary generation
requires a valid server token. The changes include comprehensive error
handling and test coverage for various scenarios, including token
validation and server misconfiguration.
  • Loading branch information
paskal committed Oct 9, 2024
1 parent 9642f97 commit 7229311
Show file tree
Hide file tree
Showing 52 changed files with 6,948 additions and 12 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
| address | UKEEPER_ADDRESS | all interfaces | web server listening address |
| port | UKEEPER_PORT | `8080` | web server port |
| mongo_uri | MONGO_URI | none | MongoDB connection string, _required_ |
| openai_key | OPENAI_KEY | none | OpenAI API key for summary generation |
| frontend_dir | FRONTEND_DIR | `/srv/web` | directory with frontend files |
| token | TOKEN | none | token for /content/v1/parser endpoint auth |
| mongo-delay | MONGO_DELAY | `0` | mongo initial delay |
Expand All @@ -20,7 +21,7 @@

### API

GET /api/content/v1/parser?token=secret&url=http://aa.com/blah - extract content (emulate Readability API parse call)
GET /api/content/v1/parser?token=secret&summary=true&url=http://aa.com/blah - extract content (emulate Readability API parse call); the `summary` parameter is optional and requires both the OpenAI key and the token to be configured
POST /api/v1/extract {url: http://aa.com/blah} - extract content

## Development
Expand Down
82 changes: 82 additions & 0 deletions backend/extractor/openai_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions backend/extractor/pics.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (
log "github.com/go-pkgz/lgr"
)

func (f UReadability) extractPics(iselect *goquery.Selection, url string) (mainImage string, allImages []string, ok bool) {
func (f *UReadability) extractPics(iselect *goquery.Selection, url string) (mainImage string, allImages []string, ok bool) {
images := make(map[int]string)

type imgInfo struct {
Expand Down Expand Up @@ -58,7 +58,7 @@ func (f UReadability) extractPics(iselect *goquery.Selection, url string) (mainI
}

// getImageSize loads image to get size
func (f UReadability) getImageSize(url string) (size int) {
func (f *UReadability) getImageSize(url string) (size int) {
httpClient := &http.Client{Timeout: time.Second * 30}
req, err := http.NewRequest("GET", url, nil)
if err != nil {
Expand Down
51 changes: 46 additions & 5 deletions backend/extractor/readability.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,17 @@ import (
"github.com/PuerkitoBio/goquery"
log "github.com/go-pkgz/lgr"
"github.com/mauidude/go-readability"
"github.com/sashabaranov/go-openai"
"go.mongodb.org/mongo-driver/bson/primitive"

"github.com/ukeeper/ukeeper-redabilty/backend/datastore"
)

//go:generate moq -out openai_mock.go . OpenAIClient

// OpenAIClient is the subset of the OpenAI client used for summary generation.
// It declares only the chat completion call, so the real client created by
// openai.NewClient can be swapped for the moq-generated mock (openai_mock.go) in tests.
type OpenAIClient interface {
	// CreateChatCompletion sends a chat completion request and returns the response.
	CreateChatCompletion(ctx context.Context, request openai.ChatCompletionRequest) (openai.ChatCompletionResponse, error)
}

// Rules interface with all methods to access datastore
type Rules interface {
Get(ctx context.Context, rURL string) (datastore.Rule, bool)
Expand All @@ -33,10 +39,14 @@ type UReadability struct {
TimeOut time.Duration
SnippetSize int
Rules Rules
OpenAIKey string

openAIClient OpenAIClient
}

// Response from api calls
type Response struct {
Summary string `json:"summary,omitempty"`
Content string `json:"content"`
Rich string `json:"rich_content"`
Domain string `json:"domain"`
Expand All @@ -59,17 +69,48 @@ var (
const userAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15"

// Extract fetches page and retrieves article
func (f UReadability) Extract(ctx context.Context, reqURL string) (*Response, error) {
func (f *UReadability) Extract(ctx context.Context, reqURL string) (*Response, error) {
return f.extractWithRules(ctx, reqURL, nil)
}

// ExtractByRule fetches page and retrieves article using a specific rule
func (f UReadability) ExtractByRule(ctx context.Context, reqURL string, rule *datastore.Rule) (*Response, error) {
func (f *UReadability) ExtractByRule(ctx context.Context, reqURL string, rule *datastore.Rule) (*Response, error) {
return f.extractWithRules(ctx, reqURL, rule)
}

// GenerateSummary sends content to the OpenAI chat completion API (GPT-4o model)
// with a summarization system prompt and returns the text of the first choice.
//
// It returns an error when OpenAIKey is empty, when the API call fails (the
// underlying error is wrapped and remains reachable via errors.Is/errors.As),
// or when the response carries no choices.
//
// The client is created from OpenAIKey on first use unless one is already set.
// NOTE(review): this lazy initialization is unsynchronized — confirm callers do
// not race on the first call.
func (f *UReadability) GenerateSummary(ctx context.Context, content string) (string, error) {
	if f.OpenAIKey == "" {
		return "", fmt.Errorf("OpenAI key is not set")
	}
	if f.openAIClient == nil {
		f.openAIClient = openai.NewClient(f.OpenAIKey)
	}

	resp, err := f.openAIClient.CreateChatCompletion(
		ctx,
		openai.ChatCompletionRequest{
			Model: openai.GPT4o,
			Messages: []openai.ChatCompletionMessage{
				{
					Role:    openai.ChatMessageRoleSystem,
					Content: "You are a helpful assistant that summarizes articles. Please summarize the main points in a few sentences as TLDR style (don't add a TLDR label). Then, list up to five detailed bullet points. Provide the response in plain text. Do not add any additional information. Do not add a Summary at the beginning of the response. If detailed bullet points are too similar to the summary, don't include them at all:",
				},
				{
					Role:    openai.ChatMessageRoleUser,
					Content: content,
				},
			},
		},
	)
	if err != nil {
		return "", fmt.Errorf("requesting summary from OpenAI: %w", err)
	}

	// guard against an empty Choices slice: indexing element 0 would panic
	if len(resp.Choices) == 0 {
		return "", fmt.Errorf("OpenAI response contains no choices")
	}

	return resp.Choices[0].Message.Content, nil
}

// extractWithRules is the core function that handles extraction with or without a specific rule
func (f UReadability) extractWithRules(ctx context.Context, reqURL string, rule *datastore.Rule) (*Response, error) {
func (f *UReadability) extractWithRules(ctx context.Context, reqURL string, rule *datastore.Rule) (*Response, error) {
log.Printf("[INFO] extract %s", reqURL)
rb := &Response{}

Expand Down Expand Up @@ -140,7 +181,7 @@ func (f UReadability) extractWithRules(ctx context.Context, reqURL string, rule
// getContent retrieves content from raw body string, both content (text only) and rich (with html tags)
// if rule is provided, it uses custom rule, otherwise tries to retrieve one from the storage,
// and at last tries to use general readability parser
func (f UReadability) getContent(ctx context.Context, body, reqURL string, rule *datastore.Rule) (content, rich string, err error) {
func (f *UReadability) getContent(ctx context.Context, body, reqURL string, rule *datastore.Rule) (content, rich string, err error) {
// general parser
genParser := func(body, _ string) (content, rich string, err error) {
doc, err := readability.NewDocument(body)
Expand Down Expand Up @@ -192,7 +233,7 @@ func (f UReadability) getContent(ctx context.Context, body, reqURL string, rule
}

// makes all links absolute and returns all found links
func (f UReadability) normalizeLinks(data string, reqContext *http.Request) (result string, links []string) {
func (f *UReadability) normalizeLinks(data string, reqContext *http.Request) (result string, links []string) {
absoluteLink := func(link string) (absLink string, changed bool) {
if r, err := reqContext.URL.Parse(link); err == nil {
return r.String(), r.String() != link
Expand Down
60 changes: 60 additions & 0 deletions backend/extractor/readability_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ import (
"testing"
"time"

"github.com/sashabaranov/go-openai"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.mongodb.org/mongo-driver/bson/primitive"

"github.com/ukeeper/ukeeper-redabilty/backend/datastore"
Expand Down Expand Up @@ -176,3 +178,61 @@ func TestGetContentCustom(t *testing.T) {
assert.Equal(t, 6988, len(content))
assert.Equal(t, 7169, len(rich))
}

func TestUReadability_GenerateSummary(t *testing.T) {
mockOpenAI := &OpenAIClientMock{
CreateChatCompletionFunc: func(ctx context.Context, request openai.ChatCompletionRequest) (openai.ChatCompletionResponse, error) {
return openai.ChatCompletionResponse{
Choices: []openai.ChatCompletionChoice{
{
Message: openai.ChatCompletionMessage{
Content: "This is a summary of the article.",
},
},
},
}, nil
},
}

tests := []struct {
name string
content string
openAIKey string
expectedResult string
expectedError string
}{
{
name: "Valid OpenAI Key and content",
content: "This is a test article content.",
openAIKey: "test-key",
expectedResult: "This is a summary of the article.",
expectedError: "",
},
{
name: "No OpenAI Key",
content: "This is a test article content.",
openAIKey: "",
expectedResult: "",
expectedError: "OpenAI key is not set",
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
readability := UReadability{
OpenAIKey: tt.openAIKey,
openAIClient: mockOpenAI,
}

result, err := readability.GenerateSummary(context.Background(), tt.content)

if tt.expectedError != "" {
require.Error(t, err)
assert.Contains(t, err.Error(), tt.expectedError)
} else {
require.NoError(t, err)
assert.Equal(t, tt.expectedResult, result)
}
})
}
}
6 changes: 3 additions & 3 deletions backend/extractor/text.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (
)

// get clean text from html content
func (f UReadability) getText(content, title string) string {
func (f *UReadability) getText(content, title string) string {
cleanText := sanitize.HTML(content)
cleanText = strings.Replace(cleanText, title, "", 1) // get rid of title in snippet
cleanText = strings.ReplaceAll(cleanText, "\t", " ")
Expand All @@ -32,7 +32,7 @@ func (f UReadability) getText(content, title string) string {
}

// get snippet from clean text content
func (f UReadability) getSnippet(cleanText string) string {
func (f *UReadability) getSnippet(cleanText string) string {
cleanText = strings.ReplaceAll(cleanText, "\n", " ")
size := len([]rune(cleanText))
if size > f.SnippetSize {
Expand All @@ -50,7 +50,7 @@ func (f UReadability) getSnippet(cleanText string) string {
}

// detect encoding, content type and convert content to utf8
func (f UReadability) toUtf8(content []byte, header http.Header) (contentType, origEncoding, result string) {
func (f *UReadability) toUtf8(content []byte, header http.Header) (contentType, origEncoding, result string) {
getContentTypeAndEncoding := func(str string) (contentType, encoding string) { // from "text/html; charset=windows-1251"
elems := strings.Split(str, ";")
contentType = strings.TrimSpace(elems[0])
Expand Down
1 change: 1 addition & 0 deletions backend/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ require (
github.com/jessevdk/go-flags v1.6.1
github.com/kennygrant/sanitize v1.2.4
github.com/mauidude/go-readability v0.0.0-20220221173116-a9b3620098b7
github.com/sashabaranov/go-openai v1.32.0
github.com/stretchr/testify v1.9.0
go.mongodb.org/mongo-driver v1.16.1
golang.org/x/net v0.28.0
Expand Down
4 changes: 4 additions & 0 deletions backend/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,10 @@ github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6So
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts=
github.com/sashabaranov/go-openai v1.28.1 h1:aREx6faUTeOZNMDTNGAY8B9vNmmN7qoGvDV0Ke2J1Mc=
github.com/sashabaranov/go-openai v1.28.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
github.com/sashabaranov/go-openai v1.32.0 h1:Yk3iE9moX3RBXxrof3OBtUBrE7qZR0zF9ebsoO4zVzI=
github.com/sashabaranov/go-openai v1.32.0/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc=
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo=
Expand Down
3 changes: 2 additions & 1 deletion backend/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ var opts struct {
MongoURI string `short:"m" long:"mongo_uri" env:"MONGO_URI" required:"true" description:"MongoDB connection string"`
MongoDelay time.Duration `long:"mongo-delay" env:"MONGO_DELAY" default:"0" description:"mongo initial delay"`
MongoDB string `long:"mongo-db" env:"MONGO_DB" default:"ureadability" description:"mongo database name"`
OpenAIKey string `long:"openai_key" env:"OPENAI_KEY" description:"OpenAI API key for summary generation"`
Debug bool `long:"dbg" env:"DEBUG" description:"debug mode"`
}

Expand All @@ -41,7 +42,7 @@ func main() {
log.Fatalf("[ERROR] can't connect to mongo %v", err)
}
srv := rest.Server{
Readability: extractor.UReadability{TimeOut: 30, SnippetSize: 300, Rules: db.GetStores()},
Readability: extractor.UReadability{TimeOut: 30, SnippetSize: 300, Rules: db.GetStores(), OpenAIKey: opts.OpenAIKey},
Token: opts.Token,
Credentials: opts.Credentials,
Version: revision,
Expand Down
Loading

0 comments on commit 7229311

Please sign in to comment.