Skip to content

Commit

Permalink
Add use_browser_user_agent field
Browse files Browse the repository at this point in the history
  • Loading branch information
traut committed Jan 2, 2025
1 parent 40f12bd commit 77b62ce
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 5 deletions.
7 changes: 7 additions & 0 deletions docs/plugins/builtin/data-sources/rss.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,13 @@ data rss {
# Default value:
fill_in_content = false
# If the data source should pretend to be a browser while fetching the feed and the feed items.
# If set to "false", the default user-agent value "blackstork-rss/0.0.1" will be used.
#
# Optional bool.
# Default value:
use_browser_user_agent = false
# Maximum number of items to fill the content in per feed.
#
# Optional number.
Expand Down
3 changes: 2 additions & 1 deletion docs/plugins/plugins.json
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,8 @@
"fill_in_content",
"fill_in_max_items",
"only_items_after_time",
"url"
"url",
"use_browser_user_agent"
]
},
{
Expand Down
25 changes: 21 additions & 4 deletions internal/builtin/data_rss.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (

// Default settings for the RSS data source.
const (
// defaultRequestTimeout is a 30-second duration.
// NOTE(review): its use site is outside the visible hunks — presumably the
// timeout for the HTTP requests made while fetching; confirm.
defaultRequestTimeout = 30 * time.Second
// defaultUserAgent is the User-Agent value used for fetching the feed and the
// feed items when the "use_browser_user_agent" argument is false; when it is
// true, a value from getRandUserAgent() is used instead.
defaultUserAgent = "blackstork-rss/0.0.1"
)

// https://techblog.willshouse.com/2012/01/03/most-common-user-agents/
Expand Down Expand Up @@ -69,6 +70,16 @@ func makeRSSDataSource() *plugin.DataSource {
If the full content should be added when it's not present in the feed items.
`,
},
{
Name: "use_browser_user_agent",
Type: cty.Bool,
DefaultVal: cty.BoolVal(false),
Constraints: constraint.NonNull,
Doc: fmt.Sprintf(`
If the data source should pretend to be a browser while fetching the feed and the feed items.
If set to "false", the default user-agent value "%s" will be used.
`, defaultUserAgent),
},
{
Name: "fill_in_max_items",
Type: cty.Number,
Expand Down Expand Up @@ -150,7 +161,7 @@ func filterItems(ctx context.Context, feed *gofeed.Feed, from time.Time) *gofeed
return feed
}

func fetchFeedItems(ctx context.Context, feed *gofeed.Feed, itemsCap int) *gofeed.Feed {
func fetchFeedItems(ctx context.Context, feed *gofeed.Feed, userAgent string, itemsCap int) *gofeed.Feed {
log := slog.Default()
log = log.With("feed_url", feed.Link, "items_cap", itemsCap)
log.InfoContext(ctx, "Fetching content for the items in the feed")
Expand Down Expand Up @@ -192,7 +203,7 @@ func fetchFeedItems(ctx context.Context, feed *gofeed.Feed, itemsCap int) *gofee
_log.ErrorContext(ctx, "Error while creating a HTTP request for a feed item link", "err", err)
return
}
req.Header.Set("User-Agent", getRandUserAgent())
req.Header.Set("User-Agent", userAgent)

resp, err := client.Do(req)
if err != nil {
Expand Down Expand Up @@ -226,14 +237,20 @@ func fetchRSSData(ctx context.Context, params *plugin.RetrieveDataParams) (plugi
log := slog.Default()

fp := gofeed.NewParser()
fp.UserAgent = getRandUserAgent()

url := params.Args.GetAttrVal("url").AsString()

fillInContent := params.Args.GetAttrVal("fill_in_content").True()
useBrowserUserAgent := params.Args.GetAttrVal("use_browser_user_agent").True()
fillInMaxItems, _ := params.Args.GetAttrVal("fill_in_max_items").AsBigFloat().Int64()
onlyItemsAfterTimeAttr := params.Args.GetAttrVal("only_items_after_time")

userAgent := defaultUserAgent
if useBrowserUserAgent {
userAgent = getRandUserAgent()
}
fp.UserAgent = userAgent

basicAuth := params.Args.Blocks.GetFirstMatching("basic_auth")
if basicAuth != nil {
fp.AuthConfig = &gofeed.Auth{
Expand Down Expand Up @@ -283,7 +300,7 @@ func fetchRSSData(ctx context.Context, params *plugin.RetrieveDataParams) (plugi
}

if fillInContent {
feed = fetchFeedItems(ctx, feed, int(fillInMaxItems))
feed = fetchFeedItems(ctx, feed, userAgent, int(fillInMaxItems))
log.InfoContext(ctx, "The content for the feed items downloaded")
}

Expand Down

0 comments on commit 77b62ce

Please sign in to comment.