From 98c304df17f5cd264763002b9b8e5006a9bf41fc Mon Sep 17 00:00:00 2001 From: traut Date: Thu, 2 Jan 2025 00:28:28 +0100 Subject: [PATCH] Add `use_browser_user_agent` field --- docs/plugins/builtin/data-sources/rss.md | 7 +++++++ docs/plugins/plugins.json | 3 ++- internal/builtin/data_rss.go | 25 ++++++++++++++++++++---- 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/docs/plugins/builtin/data-sources/rss.md b/docs/plugins/builtin/data-sources/rss.md index ac8d0529..86c9bd16 100644 --- a/docs/plugins/builtin/data-sources/rss.md +++ b/docs/plugins/builtin/data-sources/rss.md @@ -58,6 +58,13 @@ data rss { # Default value: fill_in_content = false + # If the data source should pretend to be a browser while fetching the feed and the feed items. + # If set to "false", the default user-agent value "blackstork-rss/0.0.1" will be used. + # + # Optional bool. + # Default value: + use_browser_user_agent = false + # Maximum number of items to fill the content in per feed. # # Optional number. diff --git a/docs/plugins/plugins.json b/docs/plugins/plugins.json index e95779c0..b73a1ce4 100644 --- a/docs/plugins/plugins.json +++ b/docs/plugins/plugins.json @@ -123,7 +123,8 @@ "fill_in_content", "fill_in_max_items", "only_items_after_time", - "url" + "url", + "use_browser_user_agent" ] }, { diff --git a/internal/builtin/data_rss.go b/internal/builtin/data_rss.go index 24e812e0..79df793f 100644 --- a/internal/builtin/data_rss.go +++ b/internal/builtin/data_rss.go @@ -28,6 +28,7 @@ import ( const ( defaultRequestTimeout = 30 * time.Second + defaultUserAgent = "blackstork-rss/0.0.1" ) // https://techblog.willshouse.com/2012/01/03/most-common-user-agents/ @@ -69,6 +70,16 @@ func makeRSSDataSource() *plugin.DataSource { If the full content should be added when it's not present in the feed items. `, }, + { + Name: "use_browser_user_agent", + Type: cty.Bool, + DefaultVal: cty.BoolVal(false), + Constraints: constraint.NonNull, + Doc: fmt.Sprintf(` + If the data source should pretend to be a browser while fetching the feed and the feed items. + If set to "false", the default user-agent value "%s" will be used. + `, defaultUserAgent), + }, { Name: "fill_in_max_items", Type: cty.Number, @@ -150,7 +161,7 @@ func filterItems(ctx context.Context, feed *gofeed.Feed, from time.Time) *gofeed return feed } -func fetchFeedItems(ctx context.Context, feed *gofeed.Feed, itemsCap int) *gofeed.Feed { +func fetchFeedItems(ctx context.Context, feed *gofeed.Feed, userAgent string, itemsCap int) *gofeed.Feed { log := slog.Default() log = log.With("feed_url", feed.Link, "items_cap", itemsCap) log.InfoContext(ctx, "Fetching content for the items in the feed") @@ -192,7 +203,7 @@ func fetchFeedItems(ctx context.Context, feed *gofeed.Feed, itemsCap int) *gofee _log.ErrorContext(ctx, "Error while creating a HTTP request for a feed item link", "err", err) return } - req.Header.Set("User-Agent", getRandUserAgent()) + req.Header.Set("User-Agent", userAgent) resp, err := client.Do(req) if err != nil { @@ -226,14 +237,20 @@ func fetchRSSData(ctx context.Context, params *plugin.RetrieveDataParams) (plugi log := slog.Default() fp := gofeed.NewParser() - fp.UserAgent = getRandUserAgent() url := params.Args.GetAttrVal("url").AsString() fillInContent := params.Args.GetAttrVal("fill_in_content").True() + useBrowserUserAgent := params.Args.GetAttrVal("use_browser_user_agent").True() fillInMaxItems, _ := params.Args.GetAttrVal("fill_in_max_items").AsBigFloat().Int64() onlyItemsAfterTimeAttr := params.Args.GetAttrVal("only_items_after_time") + userAgent := defaultUserAgent + if useBrowserUserAgent { + userAgent = getRandUserAgent() + } + fp.UserAgent = userAgent + basicAuth := params.Args.Blocks.GetFirstMatching("basic_auth") if basicAuth != nil { fp.AuthConfig = &gofeed.Auth{ @@ -283,7 +300,7 @@ func fetchRSSData(ctx context.Context, params *plugin.RetrieveDataParams) (plugi } if fillInContent { - feed = fetchFeedItems(ctx, feed, int(fillInMaxItems)) + feed = fetchFeedItems(ctx, feed, userAgent, int(fillInMaxItems)) log.InfoContext(ctx, "The content for the feed items downloaded") }