From 91d61a1090d2e831423b1c68ce9ca451145ddef1 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Tue, 15 Oct 2024 15:28:23 +0200 Subject: [PATCH 001/121] added spiegel --- R/deliver_spiegel_de.R | 35 +++++++++++++++++++++++++++++++++++ inst/status.csv | 1 + 2 files changed, 36 insertions(+) create mode 100644 R/deliver_spiegel_de.R diff --git a/R/deliver_spiegel_de.R b/R/deliver_spiegel_de.R new file mode 100644 index 0000000..3721103 --- /dev/null +++ b/R/deliver_spiegel_de.R @@ -0,0 +1,35 @@ +pb_deliver.spiegel.de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + datetime <- html %>% + html_search(html,c("time"),c("datetime")) %>% + lubridate::as_datetime() + + # headline + headline <- html %>% + rvest::html_elements("title") %>% + rvest::html_text() + + # author + author <- html %>% + rvest::html_nodes("header a.text-black") %>% + rvest::html_text2() %>% + toString() + + # text + text <- html %>% + rvest::html_elements("div[data-area = \"body\"]") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + # the helper function safely creates a named list from objects + s_n_list( + datetime, + author, + headline, + text + ) + +} \ No newline at end of file diff --git a/inst/status.csv b/inst/status.csv index d27ae69..a1df69e 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -60,6 +60,7 @@ "sfgate.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "skwawkbox.org","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "sky.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.skynews.com/feeds/rss/home.xml" 
+"spiegel.de","![](https://img.shields.io/badge/status-requested-lightgrey)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.spiegel.de/schlagzeilen/index.rss" "telegraaf.nl","![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)","[@JBGruber](https://github.com/JBGruber/)","[#17](https://github.com/JBGruber/paperboy/issues/17)","https://www.telegraaf.nl/rss.xml" "telegraph.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "thecanary.co","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 28619d376013a1cdb12248aedd0d3ba37a5c4c8a Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Tue, 15 Oct 2024 16:04:32 +0200 Subject: [PATCH 002/121] added docs and export --- DESCRIPTION | 2 +- NAMESPACE | 1 + R/deliver_spiegel_de.R | 9 +++++++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 15d46ce..0365775 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,6 +44,6 @@ Suggests: URL: https://github.com/JBGruber/paperboy Encoding: UTF-8 BugReports: https://github.com/JBGruber/paperboy/issues -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 VignetteBuilder: knitr Language: en-GB diff --git a/NAMESPACE b/NAMESPACE index 94f68ed..6691d48 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ S3method(pb_deliver,character) S3method(pb_deliver,data.frame) S3method(pb_deliver,default) +S3method(pb_deliver,spiegel.de) S3method(pb_deliver_paper,ac24_cz) S3method(pb_deliver_paper,ad_nl) S3method(pb_deliver_paper,aktualne_cz) diff --git a/R/deliver_spiegel_de.R b/R/deliver_spiegel_de.R index 3721103..e0c564d 100644 --- a/R/deliver_spiegel_de.R +++ b/R/deliver_spiegel_de.R @@ -1,17 +1,22 @@ +#' @export pb_deliver.spiegel.de <- function(x, verbose = NULL, pb, ...) 
{ pb_tick(x, verbose, pb) # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) datetime <- html %>% - html_search(html,c("time"),c("datetime")) %>% + html_search(c("time"),c("datetime")) %>% lubridate::as_datetime() # headline headline <- html %>% - rvest::html_elements("title") %>% + rvest::html_nodes(".font-brandUI .align-middle") %>% rvest::html_text() + if(length(headline) > 1) { + headline <- headline[!grepl("\\n", headline)] + } + # author author <- html %>% rvest::html_nodes("header a.text-black") %>% From b6c6bbafadb667edf63d4d93af012c2d8c450092 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Tue, 15 Oct 2024 16:11:40 +0200 Subject: [PATCH 003/121] renamed spiegel function --- NAMESPACE | 2 +- R/deliver_spiegel_de.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 6691d48..f64e770 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,7 +3,6 @@ S3method(pb_deliver,character) S3method(pb_deliver,data.frame) S3method(pb_deliver,default) -S3method(pb_deliver,spiegel.de) S3method(pb_deliver_paper,ac24_cz) S3method(pb_deliver_paper,ad_nl) S3method(pb_deliver_paper,aktualne_cz) @@ -55,6 +54,7 @@ S3method(pb_deliver_paper,seznamzpravy_cz) S3method(pb_deliver_paper,sfgate_com) S3method(pb_deliver_paper,skwawkbox_org) S3method(pb_deliver_paper,sky_com) +S3method(pb_deliver_paper,spiegel.de) S3method(pb_deliver_paper,telegraaf_nl) S3method(pb_deliver_paper,telegraph_co_uk) S3method(pb_deliver_paper,thecanary_co) diff --git a/R/deliver_spiegel_de.R b/R/deliver_spiegel_de.R index e0c564d..51d3689 100644 --- a/R/deliver_spiegel_de.R +++ b/R/deliver_spiegel_de.R @@ -1,5 +1,5 @@ #' @export -pb_deliver.spiegel.de <- function(x, verbose = NULL, pb, ...) { +pb_deliver_paper.spiegel.de <- function(x, verbose = NULL, pb, ...) 
{ pb_tick(x, verbose, pb) # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) From a42d16d0f086d11fcf319f590f648bb33ce32357 Mon Sep 17 00:00:00 2001 From: JBGruber <johannesb.gruber@gmail.com> Date: Tue, 15 Oct 2024 19:27:53 +0200 Subject: [PATCH 004/121] change spiegel function --- R/deliver_spiegel_de.R | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/R/deliver_spiegel_de.R b/R/deliver_spiegel_de.R index 51d3689..067ebf3 100644 --- a/R/deliver_spiegel_de.R +++ b/R/deliver_spiegel_de.R @@ -1,5 +1,5 @@ #' @export -pb_deliver_paper.spiegel.de <- function(x, verbose = NULL, pb, ...) { +pb_deliver_paper.spiegel_de <- function(x, verbose = NULL, pb, ...) { pb_tick(x, verbose, pb) # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) @@ -10,17 +10,13 @@ pb_deliver_paper.spiegel.de <- function(x, verbose = NULL, pb, ...) { # headline headline <- html %>% - rvest::html_nodes(".font-brandUI .align-middle") %>% - rvest::html_text() - - if(length(headline) > 1) { - headline <- headline[!grepl("\\n", headline)] - } + rvest::html_element("article") %>% + rvest::html_attr("aria-label") # author author <- html %>% - rvest::html_nodes("header a.text-black") %>% - rvest::html_text2() %>% + rvest::html_element("meta[name=\"author\"]") %>% + rvest::html_attr("content") %>% toString() # text @@ -37,4 +33,4 @@ pb_deliver_paper.spiegel.de <- function(x, verbose = NULL, pb, ...) 
{ text ) -} \ No newline at end of file +} From 5e9bc7e402a8d0827a3662d0ac47a2809c2bec53 Mon Sep 17 00:00:00 2001 From: JBGruber <johannesb.gruber@gmail.com> Date: Tue, 15 Oct 2024 19:31:31 +0200 Subject: [PATCH 005/121] fix namespace --- NAMESPACE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index f64e770..5ac5506 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -54,7 +54,7 @@ S3method(pb_deliver_paper,seznamzpravy_cz) S3method(pb_deliver_paper,sfgate_com) S3method(pb_deliver_paper,skwawkbox_org) S3method(pb_deliver_paper,sky_com) -S3method(pb_deliver_paper,spiegel.de) +S3method(pb_deliver_paper,spiegel_de) S3method(pb_deliver_paper,telegraaf_nl) S3method(pb_deliver_paper,telegraph_co_uk) S3method(pb_deliver_paper,thecanary_co) From 87526cae375e647b4f2edf0829ca36eaf7a915b5 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Tue, 15 Oct 2024 20:13:19 +0200 Subject: [PATCH 006/121] fixed spiegel and added bild --- NAMESPACE | 3 ++- R/deliver_bild_de.R | 37 +++++++++++++++++++++++++++++++++++++ R/deliver_spiegel_de.R | 6 +++--- inst/status.csv | 1 + 4 files changed, 43 insertions(+), 4 deletions(-) create mode 100644 R/deliver_bild_de.R diff --git a/NAMESPACE b/NAMESPACE index f64e770..6892fd4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ S3method(pb_deliver_paper,ad_nl) S3method(pb_deliver_paper,aktualne_cz) S3method(pb_deliver_paper,anotherangryvoice_blogspot_com) S3method(pb_deliver_paper,bbc_co_uk) +S3method(pb_deliver_paper,bild_de) S3method(pb_deliver_paper,blesk_cz) S3method(pb_deliver_paper,breakingnews_ie) S3method(pb_deliver_paper,breitbart_com) @@ -54,7 +55,7 @@ S3method(pb_deliver_paper,seznamzpravy_cz) S3method(pb_deliver_paper,sfgate_com) S3method(pb_deliver_paper,skwawkbox_org) S3method(pb_deliver_paper,sky_com) -S3method(pb_deliver_paper,spiegel.de) +S3method(pb_deliver_paper,spiegel_de) S3method(pb_deliver_paper,telegraaf_nl) S3method(pb_deliver_paper,telegraph_co_uk) 
S3method(pb_deliver_paper,thecanary_co) diff --git a/R/deliver_bild_de.R b/R/deliver_bild_de.R new file mode 100644 index 0000000..1f5b68f --- /dev/null +++ b/R/deliver_bild_de.R @@ -0,0 +1,37 @@ +#' @export +pb_deliver_paper.bild_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + datetime <- html %>% + rvest::html_nodes("time.datetime, time.datetime--video datetime") %>% + rvest::html_text() %>% + lubridate::as_datetime(format = "%d.%m.%Y - %H:%M Uhr ") + + # headline + headline <- html %>% + rvest::html_nodes(".document-title__headline") %>% + rvest::html_text() + + # author + author <- html %>% + rvest::html_nodes(".authors") %>% + rvest::html_text() %>% + toString() + + # text + text <- html %>% + rvest::html_nodes(".article-body") %>% + rvest::html_text() %>% + paste(collapse = "\n") + + # the helper function safely creates a named list from objects + s_n_list( + datetime, + author, + headline, + text + ) + +} \ No newline at end of file diff --git a/R/deliver_spiegel_de.R b/R/deliver_spiegel_de.R index 51d3689..6a5a834 100644 --- a/R/deliver_spiegel_de.R +++ b/R/deliver_spiegel_de.R @@ -1,16 +1,16 @@ #' @export -pb_deliver_paper.spiegel.de <- function(x, verbose = NULL, pb, ...) { +pb_deliver_paper.spiegel_de <- function(x, verbose = NULL, pb, ...) 
{ pb_tick(x, verbose, pb) # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) datetime <- html %>% - html_search(c("time"),c("datetime")) %>% + html_search(c("time"), c("datetime")) %>% lubridate::as_datetime() # headline headline <- html %>% - rvest::html_nodes(".font-brandUI .align-middle") %>% + rvest::html_nodes(".font-brandUI .align-middle,.block.font-serifdisplayUI .align-middle") %>% rvest::html_text() if(length(headline) > 1) { diff --git a/inst/status.csv b/inst/status.csv index a1df69e..cef98cb 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -4,6 +4,7 @@ "aktualne.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.aktualne.cz/rss" "anotherangryvoice.blogspot.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "bbc.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.bbci.co.uk/news/rss.xml" +"bild.de","![](https://img.shields.io/badge/status-requested-lightgrey)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.bild.de/rssfeeds/rss3-20745882,feed=alles.bild.html" "blesk.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.blesk.cz/rss" "boston.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "bostonglobe.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA From aa8e8ecdf68b79068514108d2433ee14a4698637 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Tue, 15 Oct 2024 20:50:45 +0200 Subject: [PATCH 007/121] added welt.de --- NAMESPACE | 1 + R/deliver_welt_de.R | 22 ++++++++++++++++++++++ 
inst/status.csv | 5 +++-- 3 files changed, 26 insertions(+), 2 deletions(-) create mode 100644 R/deliver_welt_de.R diff --git a/NAMESPACE b/NAMESPACE index 6892fd4..93c8b8f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -65,6 +65,7 @@ S3method(pb_deliver_paper,thesun_ie) S3method(pb_deliver_paper,usatoday_com) S3method(pb_deliver_paper,volkskrant_nl) S3method(pb_deliver_paper,washingtonpost_com) +S3method(pb_deliver_paper,welt_de) S3method(pb_deliver_paper,wsj_com) S3method(pb_deliver_paper,yahoo_com) S3method(pb_deliver_paper,zeit_de) diff --git a/R/deliver_welt_de.R b/R/deliver_welt_de.R new file mode 100644 index 0000000..b1bf4e4 --- /dev/null +++ b/R/deliver_welt_de.R @@ -0,0 +1,22 @@ +#' @export +pb_deliver_paper.welt_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] |> rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- trimws(gsub("<[^>]+>", "", json_df$articleBody)) + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index cef98cb..23ddc22 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -4,7 +4,7 @@ "aktualne.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.aktualne.cz/rss" "anotherangryvoice.blogspot.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "bbc.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.bbci.co.uk/news/rss.xml" 
-"bild.de","![](https://img.shields.io/badge/status-requested-lightgrey)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.bild.de/rssfeeds/rss3-20745882,feed=alles.bild.html" +"bild.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.bild.de/rssfeeds/rss3-20745882,feed=alles.bild.html" "blesk.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.blesk.cz/rss" "boston.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "bostonglobe.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA @@ -61,7 +61,7 @@ "sfgate.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "skwawkbox.org","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "sky.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.skynews.com/feeds/rss/home.xml" -"spiegel.de","![](https://img.shields.io/badge/status-requested-lightgrey)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.spiegel.de/schlagzeilen/index.rss" +"spiegel.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.spiegel.de/schlagzeilen/index.rss" 
"telegraaf.nl","![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)","[@JBGruber](https://github.com/JBGruber/)","[#17](https://github.com/JBGruber/paperboy/issues/17)","https://www.telegraaf.nl/rss.xml" "telegraph.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "thecanary.co","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA @@ -77,6 +77,7 @@ "usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "volkskrant.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.volkskrant.nl/rss.xml" "washingtonpost.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.washingtonpost.com/rss/world" +"welt.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.welt.de/feeds/latest.rss" "wsj.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.a.dj.com/rss/RSSWorldNews.xml" "yahoo.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://news.yahoo.com/rss.xml" "zeit.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.zeit.de/index" From c79cc1890111ca9a0ae1fab7af9e478f9339b5c3 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Tue, 15 Oct 2024 21:04:13 +0200 Subject: [PATCH 008/121] added tageschau.de --- NAMESPACE | 1 + R/deliver_tagesschau_de.R | 22 ++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 24 insertions(+) create mode 100644 
R/deliver_tagesschau_de.R diff --git a/NAMESPACE b/NAMESPACE index 93c8b8f..4f141f2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -56,6 +56,7 @@ S3method(pb_deliver_paper,sfgate_com) S3method(pb_deliver_paper,skwawkbox_org) S3method(pb_deliver_paper,sky_com) S3method(pb_deliver_paper,spiegel_de) +S3method(pb_deliver_paper,tagesschau_de) S3method(pb_deliver_paper,telegraaf_nl) S3method(pb_deliver_paper,telegraph_co_uk) S3method(pb_deliver_paper,thecanary_co) diff --git a/R/deliver_tagesschau_de.R b/R/deliver_tagesschau_de.R new file mode 100644 index 0000000..9ab5e09 --- /dev/null +++ b/R/deliver_tagesschau_de.R @@ -0,0 +1,22 @@ +#' @export +pb_deliver_paper.tagesschau_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] |> rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- trimws(gsub("<[^>]+>", "", json_df$articleBody)) + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 23ddc22..2e4ef36 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -62,6 +62,7 @@ "skwawkbox.org","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "sky.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.skynews.com/feeds/rss/home.xml" "spiegel.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.spiegel.de/schlagzeilen/index.rss" 
+"tagesschau.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "telegraaf.nl","![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)","[@JBGruber](https://github.com/JBGruber/)","[#17](https://github.com/JBGruber/paperboy/issues/17)","https://www.telegraaf.nl/rss.xml" "telegraph.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "thecanary.co","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From cdef66a63e7eb88db6d2d565f8fed1db8a65fc20 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Tue, 15 Oct 2024 21:13:25 +0200 Subject: [PATCH 009/121] added focus.de --- NAMESPACE | 1 + R/deliver_focus_de.R | 26 ++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 28 insertions(+) create mode 100644 R/deliver_focus_de.R diff --git a/NAMESPACE b/NAMESPACE index 4f141f2..f3b30dd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,6 +22,7 @@ S3method(pb_deliver_paper,default) S3method(pb_deliver_paper,denikn_cz) S3method(pb_deliver_paper,evolvepolitics_com) S3method(pb_deliver_paper,faz_net) +S3method(pb_deliver_paper,focus_de) S3method(pb_deliver_paper,forbes_com) S3method(pb_deliver_paper,foxbusiness_com) S3method(pb_deliver_paper,geenstijl_nl) diff --git a/R/deliver_focus_de.R b/R/deliver_focus_de.R new file mode 100644 index 0000000..a4385f4 --- /dev/null +++ b/R/deliver_focus_de.R @@ -0,0 +1,26 @@ +#' @export +pb_deliver_paper.focus_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] |> rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_elements(".leadIn,.textBlock") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 2e4ef36..534480c 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -22,6 +22,7 @@ "eu.usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "evolvepolitics.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "faz.net","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.faz.net/rss/aktuell/" +"focus.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://rss.focus.de/fol/XML/rss_folnews.xml" "forbes.com","![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)","[@JBGruber](https://github.com/JBGruber/)","[#2](https://github.com/JBGruber/paperboy/issues/2)",NA "fortune.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "foxbusiness.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From c975bcdf11c396e5376d54ca96539c70941f08ea Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Tue, 15 Oct 
2024 21:22:47 +0200 Subject: [PATCH 010/121] added fr.de --- NAMESPACE | 1 + R/deliver_fr_de.R | 27 +++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 29 insertions(+) create mode 100644 R/deliver_fr_de.R diff --git a/NAMESPACE b/NAMESPACE index f3b30dd..c7a401f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,6 +25,7 @@ S3method(pb_deliver_paper,faz_net) S3method(pb_deliver_paper,focus_de) S3method(pb_deliver_paper,forbes_com) S3method(pb_deliver_paper,foxbusiness_com) +S3method(pb_deliver_paper,fr_de) S3method(pb_deliver_paper,geenstijl_nl) S3method(pb_deliver_paper,hn_cz) S3method(pb_deliver_paper,huffpost_com) diff --git a/R/deliver_fr_de.R b/R/deliver_fr_de.R new file mode 100644 index 0000000..77453dc --- /dev/null +++ b/R/deliver_fr_de.R @@ -0,0 +1,27 @@ +#' @export +pb_deliver_paper.fr_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] |> rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + json_df <- json_df$mainEntity + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_elements(".id-StoryElement-leadText,.id-StoryElement-paragraph") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 534480c..a0b33fd 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -27,6 +27,7 @@ "fortune.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "foxbusiness.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA 
"foxnews.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA +"fr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.fr.de/rssfeed.rdf" "ftw.usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "geenstijl.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.geenstijl.nl/feeds/recent.atom" "hn.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://domaci.hn.cz/?m=rss" From 90a91c21cb9e516ea44150e21fcfee149bde2a21 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Tue, 15 Oct 2024 21:48:24 +0200 Subject: [PATCH 011/121] added stern.de --- NAMESPACE | 1 + R/deliver_stern_de.R | 26 ++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 28 insertions(+) create mode 100644 R/deliver_stern_de.R diff --git a/NAMESPACE b/NAMESPACE index c7a401f..cc86957 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -58,6 +58,7 @@ S3method(pb_deliver_paper,sfgate_com) S3method(pb_deliver_paper,skwawkbox_org) S3method(pb_deliver_paper,sky_com) S3method(pb_deliver_paper,spiegel_de) +S3method(pb_deliver_paper,stern_de) S3method(pb_deliver_paper,tagesschau_de) S3method(pb_deliver_paper,telegraaf_nl) S3method(pb_deliver_paper,telegraph_co_uk) diff --git a/R/deliver_stern_de.R b/R/deliver_stern_de.R new file mode 100644 index 0000000..7c19e79 --- /dev/null +++ b/R/deliver_stern_de.R @@ -0,0 +1,26 @@ +#' @export +pb_deliver_paper.stern_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] |> rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt)[1, ] + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_elements(".intro,.text-element") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index a0b33fd..ab10cac 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -64,6 +64,7 @@ "skwawkbox.org","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "sky.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.skynews.com/feeds/rss/home.xml" "spiegel.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.spiegel.de/schlagzeilen/index.rss" +"stern.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.stern.de/feed/standard/all/" "tagesschau.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "telegraaf.nl","![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)","[@JBGruber](https://github.com/JBGruber/)","[#17](https://github.com/JBGruber/paperboy/issues/17)","https://www.telegraaf.nl/rss.xml" 
"telegraph.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 16d578354634141f4e678179a14b0672bc9fa4ab Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Tue, 15 Oct 2024 21:57:53 +0200 Subject: [PATCH 012/121] added sueddeutsche --- NAMESPACE | 1 + R/deliver_sueddeutsche.R | 22 ++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 24 insertions(+) create mode 100644 R/deliver_sueddeutsche.R diff --git a/NAMESPACE b/NAMESPACE index cc86957..5d413d8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -59,6 +59,7 @@ S3method(pb_deliver_paper,skwawkbox_org) S3method(pb_deliver_paper,sky_com) S3method(pb_deliver_paper,spiegel_de) S3method(pb_deliver_paper,stern_de) +S3method(pb_deliver_paper,sueddeutsche_de) S3method(pb_deliver_paper,tagesschau_de) S3method(pb_deliver_paper,telegraaf_nl) S3method(pb_deliver_paper,telegraph_co_uk) diff --git a/R/deliver_sueddeutsche.R b/R/deliver_sueddeutsche.R new file mode 100644 index 0000000..47536b1 --- /dev/null +++ b/R/deliver_sueddeutsche.R @@ -0,0 +1,22 @@ +#' @export +pb_deliver_paper.sueddeutsche_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] |> rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- json_df$articleBody + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index ab10cac..40f381a 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -65,6 +65,7 @@ "sky.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.skynews.com/feeds/rss/home.xml" "spiegel.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.spiegel.de/schlagzeilen/index.rss" "stern.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.stern.de/feed/standard/all/" +"sueddeutsche.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://rss.sueddeutsche.de/alles" "tagesschau.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "telegraaf.nl","![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)","[@JBGruber](https://github.com/JBGruber/)","[#17](https://github.com/JBGruber/paperboy/issues/17)","https://www.telegraaf.nl/rss.xml" 
"telegraph.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 453872bed7b09524b1a148b62c5867245bb61f27 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Wed, 16 Oct 2024 09:08:27 +0200 Subject: [PATCH 013/121] added n-tv.de --- NAMESPACE | 1 + R/deliver_n-tv_de.R | 26 ++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 28 insertions(+) create mode 100644 R/deliver_n-tv_de.R diff --git a/NAMESPACE b/NAMESPACE index 5d413d8..1657e0b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -42,6 +42,7 @@ S3method(pb_deliver_paper,lidovky_cz) S3method(pb_deliver_paper,marketwatch_com) S3method(pb_deliver_paper,mediacourant_nl) S3method(pb_deliver_paper,metronieuws_nl) +S3method(pb_deliver_paper,n_tv_de) S3method(pb_deliver_paper,newstatesman_com) S3method(pb_deliver_paper,newsweek_com) S3method(pb_deliver_paper,nos_nl) diff --git a/R/deliver_n-tv_de.R b/R/deliver_n-tv_de.R new file mode 100644 index 0000000..9de5883 --- /dev/null +++ b/R/deliver_n-tv_de.R @@ -0,0 +1,26 @@ +#' @export +pb_deliver_paper.n_tv_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] |> rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_elements(".article__text") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 40f381a..fe01d39 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -52,6 +52,7 @@ "nos.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.nos.nl/nosnieuwsalgemeen" "novinky.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.novinky.cz/rss" "nrc.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA +"n-tv.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.n-tv.de/rss" "nu.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.nu.nl/rss" "nypost.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "nytimes.com","![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)","[@JBGruber](https://github.com/JBGruber/)","[#17](https://github.com/JBGruber/paperboy/issues/17)","https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml" From d0e4330005ce2482c21529d9daeb0480bed6a089 Mon Sep 17 00:00:00 2001 From: 
schochastics <david@schochastics.net> Date: Wed, 16 Oct 2024 09:28:24 +0200 Subject: [PATCH 014/121] added rtl.de --- NAMESPACE | 1 + R/deliver_rtl_de.R | 31 +++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 33 insertions(+) create mode 100644 R/deliver_rtl_de.R diff --git a/NAMESPACE b/NAMESPACE index 1657e0b..9e336ab 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -53,6 +53,7 @@ S3method(pb_deliver_paper,nypost_com) S3method(pb_deliver_paper,nytimes_com) S3method(pb_deliver_paper,parlamentnilisty_cz) S3method(pb_deliver_paper,rte_ie) +S3method(pb_deliver_paper,rtl_de) S3method(pb_deliver_paper,rtl_nl) S3method(pb_deliver_paper,seznamzpravy_cz) S3method(pb_deliver_paper,sfgate_com) diff --git a/R/deliver_rtl_de.R b/R/deliver_rtl_de.R new file mode 100644 index 0000000..dc18132 --- /dev/null +++ b/R/deliver_rtl_de.R @@ -0,0 +1,31 @@ +#' @export +pb_deliver_paper.rtl_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] |> rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + if (json_df$`@type` != "VideoObject") { # NewsArticle + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_elements(".article-body .LeadText_lead__rfwFU,.article-body .AnnotatedMarkup_paragraph__IUT9l") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + } else { + datetime <- lubridate::as_datetime(json_df$uploadDate) + headline <- json_df$name + author <- "" + text <- json_df$transcript # for video objects, use transcript as text + } + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index fe01d39..3ec04a2 100644 --- a/inst/status.csv +++ b/inst/status.csv 
@@ -59,6 +59,7 @@ "pagesix.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "parlamentnilisty.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","http://www.parlamentnilisty.cz/export/rss.aspx" "rte.ie","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.rte.ie/feeds/rss/?index=/news/" +"rtl.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.rtl.de/rss/feed/news" "rtl.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.rtlnieuws.nl/rss.xml" "seznamzpravy.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.seznamzpravy.cz/rss" "sfgate.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 42c8922cac70567507c9a02eca0ec7727863a274 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Wed, 16 Oct 2024 09:44:50 +0200 Subject: [PATCH 015/121] added prosieben.de --- NAMESPACE | 1 + R/deliver_prosieben_de.R | 36 ++++++++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 38 insertions(+) create mode 100644 R/deliver_prosieben_de.R diff --git a/NAMESPACE b/NAMESPACE index 9e336ab..7dab8af 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -52,6 +52,7 @@ S3method(pb_deliver_paper,nu_nl) S3method(pb_deliver_paper,nypost_com) S3method(pb_deliver_paper,nytimes_com) S3method(pb_deliver_paper,parlamentnilisty_cz) +S3method(pb_deliver_paper,prosieben_de) S3method(pb_deliver_paper,rte_ie) S3method(pb_deliver_paper,rtl_de) S3method(pb_deliver_paper,rtl_nl) diff --git a/R/deliver_prosieben_de.R 
b/R/deliver_prosieben_de.R new file mode 100644 index 0000000..9ebbd50 --- /dev/null +++ b/R/deliver_prosieben_de.R @@ -0,0 +1,36 @@ +#' @export +pb_deliver_paper.prosieben_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ") + if (length(json_txt) == 2) { + json_txt <- json_txt[2] %>% rvest::html_text() + } else { + json_txt <- json_txt %>% rvest::html_text() + } + json_df <- jsonlite::fromJSON(json_txt) + if (json_df$`@type` != "VideoObject") { # NewsArticle + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_elements(".css-f9qfdi p.css-bq2685,.css-f9qfdi h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + } else { + datetime <- lubridate::as_datetime(json_df$uploadDate) + headline <- json_df$name + author <- "" + text <- json_df$description # for video objects, use description as text + } + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 3ec04a2..1dc9d3e 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -58,6 +58,7 @@ "nytimes.com","![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)","[@JBGruber](https://github.com/JBGruber/)","[#17](https://github.com/JBGruber/paperboy/issues/17)","https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml" "pagesix.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "parlamentnilisty.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","http://www.parlamentnilisty.cz/export/rss.aspx" 
+"prosieben.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "rte.ie","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.rte.ie/feeds/rss/?index=/news/" "rtl.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.rtl.de/rss/feed/news" "rtl.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.rtlnieuws.nl/rss.xml" From 7ef9bfc5b19630e948f9c65e723040d279f25923 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Wed, 16 Oct 2024 09:47:21 +0200 Subject: [PATCH 016/121] replaced base R pipe with magrittr pipe --- R/deliver_focus_de.R | 2 +- R/deliver_fr_de.R | 2 +- R/deliver_n-tv_de.R | 2 +- R/deliver_rtl_de.R | 2 +- R/deliver_stern_de.R | 2 +- R/deliver_sueddeutsche.R | 2 +- R/deliver_tagesschau_de.R | 2 +- R/deliver_welt_de.R | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/R/deliver_focus_de.R b/R/deliver_focus_de.R index a4385f4..60b4d32 100644 --- a/R/deliver_focus_de.R +++ b/R/deliver_focus_de.R @@ -4,7 +4,7 @@ pb_deliver_paper.focus_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] |> rvest::html_text() + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() json_df <- jsonlite::fromJSON(json_txt) datetime <- lubridate::as_datetime(json_df$datePublished) diff --git a/R/deliver_fr_de.R b/R/deliver_fr_de.R index 77453dc..e77617b 100644 --- a/R/deliver_fr_de.R +++ b/R/deliver_fr_de.R @@ -4,7 +4,7 @@ pb_deliver_paper.fr_de <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] |> rvest::html_text() + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() json_df <- jsonlite::fromJSON(json_txt) json_df <- json_df$mainEntity diff --git a/R/deliver_n-tv_de.R b/R/deliver_n-tv_de.R index 9de5883..5e3f096 100644 --- a/R/deliver_n-tv_de.R +++ b/R/deliver_n-tv_de.R @@ -4,7 +4,7 @@ pb_deliver_paper.n_tv_de <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] |> rvest::html_text() + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() json_df <- jsonlite::fromJSON(json_txt) datetime <- lubridate::as_datetime(json_df$datePublished) diff --git a/R/deliver_rtl_de.R b/R/deliver_rtl_de.R index dc18132..3238812 100644 --- a/R/deliver_rtl_de.R +++ b/R/deliver_rtl_de.R @@ -4,7 +4,7 @@ pb_deliver_paper.rtl_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] |> rvest::html_text() + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() json_df <- jsonlite::fromJSON(json_txt) if (json_df$`@type` != "VideoObject") { # NewsArticle datetime <- lubridate::as_datetime(json_df$datePublished) diff --git a/R/deliver_stern_de.R b/R/deliver_stern_de.R index 7c19e79..33e93ce 100644 --- a/R/deliver_stern_de.R +++ b/R/deliver_stern_de.R @@ -4,7 +4,7 @@ pb_deliver_paper.stern_de <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] |> rvest::html_text() + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() json_df <- jsonlite::fromJSON(json_txt)[1, ] datetime <- lubridate::as_datetime(json_df$datePublished) diff --git a/R/deliver_sueddeutsche.R b/R/deliver_sueddeutsche.R index 47536b1..bddcbd8 100644 --- a/R/deliver_sueddeutsche.R +++ b/R/deliver_sueddeutsche.R @@ -4,7 +4,7 @@ pb_deliver_paper.sueddeutsche_de <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] |> rvest::html_text() + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() json_df <- jsonlite::fromJSON(json_txt) datetime <- lubridate::as_datetime(json_df$datePublished) diff --git a/R/deliver_tagesschau_de.R b/R/deliver_tagesschau_de.R index 9ab5e09..effc1e2 100644 --- a/R/deliver_tagesschau_de.R +++ b/R/deliver_tagesschau_de.R @@ -4,7 +4,7 @@ pb_deliver_paper.tagesschau_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] |> rvest::html_text() + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() json_df <- jsonlite::fromJSON(json_txt) datetime <- lubridate::as_datetime(json_df$datePublished) diff --git a/R/deliver_welt_de.R b/R/deliver_welt_de.R index b1bf4e4..66d30a5 100644 --- a/R/deliver_welt_de.R +++ b/R/deliver_welt_de.R @@ -4,7 +4,7 @@ pb_deliver_paper.welt_de <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] |> rvest::html_text() + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() json_df <- jsonlite::fromJSON(json_txt) datetime <- lubridate::as_datetime(json_df$datePublished) From a4470ed4986862fcb4e218e1f37a25dab99b0fb4 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Wed, 16 Oct 2024 09:51:26 +0200 Subject: [PATCH 017/121] renamed sueddeutsche file --- R/{deliver_sueddeutsche.R => deliver_sueddeutsche_de.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename R/{deliver_sueddeutsche.R => deliver_sueddeutsche_de.R} (100%) diff --git a/R/deliver_sueddeutsche.R b/R/deliver_sueddeutsche_de.R similarity index 100% rename from R/deliver_sueddeutsche.R rename to R/deliver_sueddeutsche_de.R From 4d6bfeecbabd23e591356be5adb4cd55c8087f9f Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Wed, 16 Oct 2024 10:04:39 +0200 Subject: [PATCH 018/121] added rp-online.de --- NAMESPACE | 1 + R/deliver_rp_online_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_rp_online_de.R diff --git a/NAMESPACE b/NAMESPACE index 7dab8af..b296dee 100644 
--- a/NAMESPACE +++ b/NAMESPACE @@ -53,6 +53,7 @@ S3method(pb_deliver_paper,nypost_com) S3method(pb_deliver_paper,nytimes_com) S3method(pb_deliver_paper,parlamentnilisty_cz) S3method(pb_deliver_paper,prosieben_de) +S3method(pb_deliver_paper,rp_online_de) S3method(pb_deliver_paper,rte_ie) S3method(pb_deliver_paper,rtl_de) S3method(pb_deliver_paper,rtl_nl) diff --git a/R/deliver_rp_online_de.R b/R/deliver_rp_online_de.R new file mode 100644 index 0000000..e8fb160 --- /dev/null +++ b/R/deliver_rp_online_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.rp_online_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("strong[data-cy=\"intro\"],div[data-cy=\"article_content\"] p") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 1dc9d3e..6cf23f2 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -59,6 +59,7 @@ "pagesix.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "parlamentnilisty.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","http://www.parlamentnilisty.cz/export/rss.aspx" "prosieben.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA 
+"rp-online.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://rp-online.de/feed.rss" "rte.ie","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.rte.ie/feeds/rss/?index=/news/" "rtl.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.rtl.de/rss/feed/news" "rtl.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.rtlnieuws.nl/rss.xml" From bc7ba880e823d00e679a751d04f4198d1af6748a Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Wed, 16 Oct 2024 10:21:20 +0200 Subject: [PATCH 019/121] added t-online.de --- NAMESPACE | 1 + R/deliver_t_online_de.R | 26 ++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 28 insertions(+) create mode 100644 R/deliver_t_online_de.R diff --git a/NAMESPACE b/NAMESPACE index b296dee..5f161f9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -64,6 +64,7 @@ S3method(pb_deliver_paper,sky_com) S3method(pb_deliver_paper,spiegel_de) S3method(pb_deliver_paper,stern_de) S3method(pb_deliver_paper,sueddeutsche_de) +S3method(pb_deliver_paper,t_online_de) S3method(pb_deliver_paper,tagesschau_de) S3method(pb_deliver_paper,telegraaf_nl) S3method(pb_deliver_paper,telegraph_co_uk) diff --git a/R/deliver_t_online_de.R b/R/deliver_t_online_de.R new file mode 100644 index 0000000..05bdb8a --- /dev/null +++ b/R/deliver_t_online_de.R @@ -0,0 +1,26 @@ +#' @export +pb_deliver_paper.t_online_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + json_df <- json_df$`@graph`[1, ] + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author[[1]]$name) + text <- html %>% + rvest::html_nodes("div[data-testid=\"ArticleBody.StreamLayout\"] p") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 6cf23f2..b1ed841 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -82,6 +82,7 @@ "thismorningwithgordondeal.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "time.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "tribpub.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA +"t-online.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.t-online.de/nachrichten/feed.rss" "us.cnn.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "volkskrant.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.volkskrant.nl/rss.xml" From a945fc4fc9975f68371b5399fbb48274fd514a6a Mon Sep 17 00:00:00 2001 From: 
schochastics <david@schochastics.net> Date: Wed, 16 Oct 2024 11:34:08 +0200 Subject: [PATCH 020/121] added zdf.de --- NAMESPACE | 1 + R/deliver_zdf_de.R | 44 ++++++++++++++++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 46 insertions(+) create mode 100644 R/deliver_zdf_de.R diff --git a/NAMESPACE b/NAMESPACE index 5f161f9..681babc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -78,6 +78,7 @@ S3method(pb_deliver_paper,washingtonpost_com) S3method(pb_deliver_paper,welt_de) S3method(pb_deliver_paper,wsj_com) S3method(pb_deliver_paper,yahoo_com) +S3method(pb_deliver_paper,zdf_de) S3method(pb_deliver_paper,zeit_de) export("%>%") export(pb_available) diff --git a/R/deliver_zdf_de.R b/R/deliver_zdf_de.R new file mode 100644 index 0000000..e11177c --- /dev/null +++ b/R/deliver_zdf_de.R @@ -0,0 +1,44 @@ +#' @export +pb_deliver_paper.zdf_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ") + if (length(json_txt) != 1) { + json_txt <- json_txt[2] + } + json_txt <- json_txt %>% rvest::html_text() + json_df <- jsonlite::fromJSON(gsub("\r\n", " ", json_txt)) + if (json_df$`@type` != "VideoObject" && json_df$`@type` != "BreadcrumbList") { + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".r1nj4qn5") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + } else if (json_df$`@type` == "VideoObject") { + datetime <- lubridate::as_datetime(json_df$uploadDate) + headline <- json_df$name + author <- toString(json_df$publisher$name) + text <- json_df$description + } else { + datetime <- html %>% + rvest::html_node("time") %>% + rvest::html_attr("datetime") %>% + lubridate::as_datetime() + headline <- html %>% + rvest::html_node("main h2") %>% + rvest::html_text() 
+ author <- "" + text <- "" + } + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index b1ed841..510949e 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -90,4 +90,5 @@ "welt.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.welt.de/feeds/latest.rss" "wsj.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.a.dj.com/rss/RSSWorldNews.xml" "yahoo.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://news.yahoo.com/rss.xml" +"zdf.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.zdf.de/rss/zdf/nachrichten" "zeit.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.zeit.de/index" From 7e262170b61882bb3214090444f48c04cbc8525c Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Wed, 16 Oct 2024 14:44:33 +0200 Subject: [PATCH 021/121] added tagesspiegel.de --- NAMESPACE | 1 + R/deliver_tagesspiegel_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_tagesspiegel_de.R diff --git a/NAMESPACE b/NAMESPACE index 681babc..51bd17d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -66,6 +66,7 @@ S3method(pb_deliver_paper,stern_de) S3method(pb_deliver_paper,sueddeutsche_de) S3method(pb_deliver_paper,t_online_de) S3method(pb_deliver_paper,tagesschau_de) +S3method(pb_deliver_paper,tagesspiegel_de) S3method(pb_deliver_paper,telegraaf_nl) S3method(pb_deliver_paper,telegraph_co_uk) 
S3method(pb_deliver_paper,thecanary_co) diff --git a/R/deliver_tagesspiegel_de.R b/R/deliver_tagesspiegel_de.R new file mode 100644 index 0000000..2e40f49 --- /dev/null +++ b/R/deliver_tagesspiegel_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.tagesspiegel_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("#story-elements p") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 510949e..a29e6aa 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -71,6 +71,7 @@ "stern.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.stern.de/feed/standard/all/" "sueddeutsche.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://rss.sueddeutsche.de/alles" "tagesschau.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA +"tagesspiegel.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.tagesspiegel.de/contentexport/feed/home" 
"telegraaf.nl","![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)","[@JBGruber](https://github.com/JBGruber/)","[#17](https://github.com/JBGruber/paperboy/issues/17)","https://www.telegraaf.nl/rss.xml" "telegraph.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "thecanary.co","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 05d3045193c0d9b9e9d969adcaa5d18a1b69d86c Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Wed, 16 Oct 2024 14:51:48 +0200 Subject: [PATCH 022/121] added morgenpost.de --- NAMESPACE | 1 + R/deliver_morgenpost_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_morgenpost_de.R diff --git a/NAMESPACE b/NAMESPACE index 51bd17d..385d1c9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -42,6 +42,7 @@ S3method(pb_deliver_paper,lidovky_cz) S3method(pb_deliver_paper,marketwatch_com) S3method(pb_deliver_paper,mediacourant_nl) S3method(pb_deliver_paper,metronieuws_nl) +S3method(pb_deliver_paper,morgenpost_de) S3method(pb_deliver_paper,n_tv_de) S3method(pb_deliver_paper,newstatesman_com) S3method(pb_deliver_paper,newsweek_com) diff --git a/R/deliver_morgenpost_de.R b/R/deliver_morgenpost_de.R new file mode 100644 index 0000000..6d4940d --- /dev/null +++ b/R/deliver_morgenpost_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.morgenpost_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".article-body p") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index a29e6aa..adc4ae0 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -46,6 +46,7 @@ "marketwatch.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "mediacourant.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.mediacourant.nl/feed/" "metronieuws.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.metronieuws.nl/feed/" +"morgenpost.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "msnbc.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "newstatesman.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.newstatesman.com/feed/" "newsweek.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 7cb8a2ce901b2b2071010620eddc47d442ec89c0 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Wed, 16 Oct 2024 15:48:37 +0200 
Subject: [PATCH 023/121] added handelsblatt.com --- NAMESPACE | 1 + R/deliver_handelsblatt_de.R | 27 +++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 29 insertions(+) create mode 100644 R/deliver_handelsblatt_de.R diff --git a/NAMESPACE b/NAMESPACE index 385d1c9..3f6fc94 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,6 +27,7 @@ S3method(pb_deliver_paper,forbes_com) S3method(pb_deliver_paper,foxbusiness_com) S3method(pb_deliver_paper,fr_de) S3method(pb_deliver_paper,geenstijl_nl) +S3method(pb_deliver_paper,handelsblatt_com) S3method(pb_deliver_paper,hn_cz) S3method(pb_deliver_paper,huffpost_com) S3method(pb_deliver_paper,idnes_cz) diff --git a/R/deliver_handelsblatt_de.R b/R/deliver_handelsblatt_de.R new file mode 100644 index 0000000..af2b67b --- /dev/null +++ b/R/deliver_handelsblatt_de.R @@ -0,0 +1,27 @@ +#' @export +pb_deliver_paper.handelsblatt_com <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + # html <- rvest::read_html(x$content_raw) + base_url <- "https://content.www.handelsblatt.com/api/content/eager/?url=" + path <- adaR::ada_get_pathname(x$expanded_url) + json_df <- jsonlite::fromJSON(paste0(base_url, path)) + if (json_df$type == "redirect") { + path <- json_df$location + json_df <- jsonlite::fromJSON(paste0(base_url, path)) + } + datetime <- lubridate::as_datetime(json_df$header$dates$published) + headline <- json_df$header$headline + author <- toString(paste(json_df$authors$firstName, json_df$authors$lastName)) + text <- jsonlite::fromJSON(json_df$seo$jsonLd)$articleBody + text <- text[!is.na(text)] + text <- paste(text, collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index adc4ae0..791358d 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -30,6 +30,7 @@ 
"fr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.fr.de/rssfeed.rdf" "ftw.usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "geenstijl.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.geenstijl.nl/feeds/recent.atom" +"handelsblatt.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.handelsblatt.com/contentexport/feed/schlagzeilen" "hn.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://domaci.hn.cz/?m=rss" "huffingtonpost.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "idnes.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 34895ef2ec74d7cabb35dd8f1d0cceaff76a9510 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Wed, 16 Oct 2024 15:58:31 +0200 Subject: [PATCH 024/121] added berliner-zeitung.de --- NAMESPACE | 1 + R/deliver_berliner_zeitung_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_berliner_zeitung_de.R diff --git a/NAMESPACE b/NAMESPACE index 3f6fc94..4a5316b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,7 @@ S3method(pb_deliver_paper,ad_nl) S3method(pb_deliver_paper,aktualne_cz) S3method(pb_deliver_paper,anotherangryvoice_blogspot_com) S3method(pb_deliver_paper,bbc_co_uk) +S3method(pb_deliver_paper,berliner_zeitung_de) S3method(pb_deliver_paper,bild_de) S3method(pb_deliver_paper,blesk_cz) 
S3method(pb_deliver_paper,breakingnews_ie) diff --git a/R/deliver_berliner_zeitung_de.R b/R/deliver_berliner_zeitung_de.R new file mode 100644 index 0000000..009b61e --- /dev/null +++ b/R/deliver_berliner_zeitung_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.berliner_zeitung_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".article_paragraph__hXYKJ") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 791358d..97a9b5e 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -4,6 +4,7 @@ "aktualne.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.aktualne.cz/rss" "anotherangryvoice.blogspot.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "bbc.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.bbci.co.uk/news/rss.xml" +"berliner-zeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.berliner-zeitung.de/feed.xml" 
"bild.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.bild.de/rssfeeds/rss3-20745882,feed=alles.bild.html" "blesk.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.blesk.cz/rss" "boston.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA From 9f40bb0bc7c28744ff8bf55638e14e6a576a1801 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Wed, 16 Oct 2024 19:43:58 +0200 Subject: [PATCH 025/121] added badische-zeitung.de --- NAMESPACE | 1 + R/deliver_badische_zeitung_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_badische_zeitung_de.R diff --git a/NAMESPACE b/NAMESPACE index 4a5316b..752eb51 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ S3method(pb_deliver_paper,ac24_cz) S3method(pb_deliver_paper,ad_nl) S3method(pb_deliver_paper,aktualne_cz) S3method(pb_deliver_paper,anotherangryvoice_blogspot_com) +S3method(pb_deliver_paper,badische_zeitung_de) S3method(pb_deliver_paper,bbc_co_uk) S3method(pb_deliver_paper,berliner_zeitung_de) S3method(pb_deliver_paper,bild_de) diff --git a/R/deliver_badische_zeitung_de.R b/R/deliver_badische_zeitung_de.R new file mode 100644 index 0000000..c94d4bb --- /dev/null +++ b/R/deliver_badische_zeitung_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.badische_zeitung_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(iconv(x$content_raw, from = "ISO-8859-1", to = "UTF-8")) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author) + text <- html %>% + rvest::html_nodes("section[role = \"article\"], .article-site__topic") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 97a9b5e..5fc2a0c 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -3,6 +3,7 @@ "ad.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.ad.nl/home/rss.xml" "aktualne.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.aktualne.cz/rss" "anotherangryvoice.blogspot.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA +"badische-zeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "bbc.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.bbci.co.uk/news/rss.xml" "berliner-zeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.berliner-zeitung.de/feed.xml" 
"bild.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.bild.de/rssfeeds/rss3-20745882,feed=alles.bild.html" From 1a0807cd3a9d0dd7a7e3c3c5c4a951421e257b5e Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Wed, 16 Oct 2024 19:59:15 +0200 Subject: [PATCH 026/121] added derwesten.de --- NAMESPACE | 1 + R/deliver_derwesten_de.R | 29 +++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 31 insertions(+) create mode 100644 R/deliver_derwesten_de.R diff --git a/NAMESPACE b/NAMESPACE index 752eb51..49e1feb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,6 +22,7 @@ S3method(pb_deliver_paper,cnn_com) S3method(pb_deliver_paper,dailymail_co_uk) S3method(pb_deliver_paper,default) S3method(pb_deliver_paper,denikn_cz) +S3method(pb_deliver_paper,derwesten_de) S3method(pb_deliver_paper,evolvepolitics_com) S3method(pb_deliver_paper,faz_net) S3method(pb_deliver_paper,focus_de) diff --git a/R/deliver_derwesten_de.R b/R/deliver_derwesten_de.R new file mode 100644 index 0000000..3bcb4ae --- /dev/null +++ b/R/deliver_derwesten_de.R @@ -0,0 +1,29 @@ +#' @export +pb_deliver_paper.derwesten_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + json_df <- json_df$`@graph`[1, ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- html %>% + rvest::html_nodes(".author.vcard .url.fn.n") %>% + rvest::html_text() %>% + toString() + + text <- html %>% + rvest::html_nodes(".lead p,.article-body p") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 5fc2a0c..8ff627d 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -20,6 +20,7 @@ "dailymail.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.dailymail.co.uk/news/index.rss" "decider.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "denikn.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://denikn.cz/rss" +"badische-zeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.derwesten.de/feed" "edition.cnn.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","http://rss.cnn.com/rss/edition.rss" "eu.usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "evolvepolitics.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 
a34a578a2565365643860907475c111e09d26920 Mon Sep 17 00:00:00 2001
From: schochastics <david@schochastics.net>
Date: Wed, 16 Oct 2024 20:22:51 +0200
Subject: [PATCH 027/121] added tag24.de

---
 NAMESPACE            |  1 +
 R/deliver_tag24_de.R | 27 +++++++++++++++++++++++++++
 inst/status.csv      |  1 +
 3 files changed, 29 insertions(+)
 create mode 100644 R/deliver_tag24_de.R

diff --git a/NAMESPACE b/NAMESPACE
index 49e1feb..6816984 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -70,6 +70,7 @@ S3method(pb_deliver_paper,spiegel_de)
 S3method(pb_deliver_paper,stern_de)
 S3method(pb_deliver_paper,sueddeutsche_de)
 S3method(pb_deliver_paper,t_online_de)
+S3method(pb_deliver_paper,tag24_de)
 S3method(pb_deliver_paper,tagesschau_de)
 S3method(pb_deliver_paper,tagesspiegel_de)
 S3method(pb_deliver_paper,telegraaf_nl)
diff --git a/R/deliver_tag24_de.R b/R/deliver_tag24_de.R
new file mode 100644
index 0000000..b8443e3
--- /dev/null
+++ b/R/deliver_tag24_de.R
@@ -0,0 +1,27 @@
+# Deliver method for tag24.de.
+#
+# Every field, including the text, is read from the first JSON-LD <script>
+# block of the page.
+#' @export
+pb_deliver_paper.tag24_de <- function(x, verbose = NULL, pb, ...) {
+  pb_tick(x, verbose, pb)
+  # x$content_raw holds the raw html of the article
+  page <- rvest::read_html(x$content_raw)
+
+  # first ld+json script node, parsed into an R object
+  ld_nodes <- rvest::html_nodes(page, "script[type = \"application/ld+json\"] ")
+  json_df <- jsonlite::fromJSON(rvest::html_text(ld_nodes[1]))
+
+  datetime <- lubridate::as_datetime(json_df$datePublished)
+  headline <- json_df$headline
+  author <- toString(json_df$author$name)
+  text <- json_df$articleBody
+
+  s_n_list(
+    datetime,
+    author,
+    headline,
+    text,
+    json_df # dumping the whole json data of an article
+  )
+}
diff --git a/inst/status.csv b/inst/status.csv
index 8ff627d..6c71078 100644
--- a/inst/status.csv
+++ b/inst/status.csv
@@ -75,6 +75,7 @@
 "spiegel.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.spiegel.de/schlagzeilen/index.rss"
 "stern.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.stern.de/feed/standard/all/"
 "sueddeutsche.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://rss.sueddeutsche.de/alles"
+"tag24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA
 "tagesschau.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA
 "tagesspiegel.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.tagesspiegel.de/contentexport/feed/home"
"telegraaf.nl","![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)","[@JBGruber](https://github.com/JBGruber/)","[#17](https://github.com/JBGruber/paperboy/issues/17)","https://www.telegraaf.nl/rss.xml" From 07c6d2b0d303a8f41a12c2ae42c6592db86bdc50 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Wed, 16 Oct 2024 20:49:51 +0200 Subject: [PATCH 028/121] added heise.de --- NAMESPACE | 1 + R/deliver_heise_de.R | 24 ++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 26 insertions(+) create mode 100644 R/deliver_heise_de.R diff --git a/NAMESPACE b/NAMESPACE index 6816984..4a5aec2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -31,6 +31,7 @@ S3method(pb_deliver_paper,foxbusiness_com) S3method(pb_deliver_paper,fr_de) S3method(pb_deliver_paper,geenstijl_nl) S3method(pb_deliver_paper,handelsblatt_com) +S3method(pb_deliver_paper,heise_de) S3method(pb_deliver_paper,hn_cz) S3method(pb_deliver_paper,huffpost_com) S3method(pb_deliver_paper,idnes_cz) diff --git a/R/deliver_heise_de.R b/R/deliver_heise_de.R new file mode 100644 index 0000000..3432b23 --- /dev/null +++ b/R/deliver_heise_de.R @@ -0,0 +1,24 @@ +#' @export +pb_deliver_paper.heise_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + + text <- html %>% + rvest::html_nodes("#lead,#article-content-body .ringCommonDetail.ringBlockType-paragraph,.article-content,.a-article-header__lead") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text + ) +} diff --git a/inst/status.csv b/inst/status.csv index 6c71078..2a7d71c 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -34,6 +34,7 @@ "ftw.usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "geenstijl.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.geenstijl.nl/feeds/recent.atom" "handelsblatt.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.handelsblatt.com/contentexport/feed/schlagzeilen" +"heise.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://www.heise.de/rss/heise.rdf" "hn.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://domaci.hn.cz/?m=rss" "huffingtonpost.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "idnes.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 
edd6439d19b71068f151250639fe882e697b3e83 Mon Sep 17 00:00:00 2001
From: schochastics <david@schochastics.net>
Date: Wed, 16 Oct 2024 20:58:59 +0200
Subject: [PATCH 029/121] added merkur.de

---
 NAMESPACE             |  1 +
 R/deliver_merkur_de.R | 34 ++++++++++++++++++++++++++++++++++
 inst/status.csv       |  1 +
 3 files changed, 36 insertions(+)
 create mode 100644 R/deliver_merkur_de.R

diff --git a/NAMESPACE b/NAMESPACE
index 4a5aec2..8df6508 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -46,6 +46,7 @@ S3method(pb_deliver_paper,latimes_com)
 S3method(pb_deliver_paper,lidovky_cz)
 S3method(pb_deliver_paper,marketwatch_com)
 S3method(pb_deliver_paper,mediacourant_nl)
+S3method(pb_deliver_paper,merkur_de)
 S3method(pb_deliver_paper,metronieuws_nl)
 S3method(pb_deliver_paper,morgenpost_de)
 S3method(pb_deliver_paper,n_tv_de)
diff --git a/R/deliver_merkur_de.R b/R/deliver_merkur_de.R
new file mode 100644
index 0000000..8f670a0
--- /dev/null
+++ b/R/deliver_merkur_de.R
@@ -0,0 +1,34 @@
+# Deliver method for merkur.de.
+#
+# Date, headline and author are read from the `mainEntity` element of the
+# first JSON-LD <script> block, the text from the nodes matched by the
+# story-element selectors.
+#' @export
+pb_deliver_paper.merkur_de <- function(x, verbose = NULL, pb, ...) {
+  pb_tick(x, verbose, pb)
+  # x$content_raw holds the raw html of the article
+  page <- rvest::read_html(x$content_raw)
+
+  # first ld+json script node, parsed; only its mainEntity element is kept
+  ld_nodes <- rvest::html_nodes(page, "script[type = \"application/ld+json\"] ")
+  json_df <- jsonlite::fromJSON(rvest::html_text(ld_nodes[1]))$mainEntity
+
+  datetime <- lubridate::as_datetime(json_df$datePublished)
+  headline <- json_df$headline
+  author <- toString(json_df$author$name)
+
+  # text of all matched nodes, joined by newlines
+  body_nodes <- rvest::html_nodes(
+    page,
+    ".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead"
+  )
+  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")
+
+  s_n_list(
+    datetime,
+    author,
+    headline,
+    text,
+    json_df # dumping the whole json data of an article
+  )
+}
diff --git a/inst/status.csv b/inst/status.csv
index 2a7d71c..fab9b2d 100644
--- a/inst/status.csv
+++ b/inst/status.csv
@@ -50,6 +50,7 @@
 "lnk.techrepublic.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA
 "marketwatch.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA
 "mediacourant.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.mediacourant.nl/feed/"
+"merkur.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "http://www.merkur.de/rssfeed.rdf"
 "metronieuws.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.metronieuws.nl/feed/"
 "morgenpost.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA
"msnbc.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA From 389985892ff0005d1592779f107eed850f599c93 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Wed, 16 Oct 2024 21:35:49 +0200 Subject: [PATCH 030/121] added ndr.de --- NAMESPACE | 1 + R/deliver_ndr_de.R | 30 ++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 32 insertions(+) create mode 100644 R/deliver_ndr_de.R diff --git a/NAMESPACE b/NAMESPACE index 8df6508..f583e30 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -50,6 +50,7 @@ S3method(pb_deliver_paper,merkur_de) S3method(pb_deliver_paper,metronieuws_nl) S3method(pb_deliver_paper,morgenpost_de) S3method(pb_deliver_paper,n_tv_de) +S3method(pb_deliver_paper,ndr_de) S3method(pb_deliver_paper,newstatesman_com) S3method(pb_deliver_paper,newsweek_com) S3method(pb_deliver_paper,nos_nl) diff --git a/R/deliver_ndr_de.R b/R/deliver_ndr_de.R new file mode 100644 index 0000000..8de58a9 --- /dev/null +++ b/R/deliver_ndr_de.R @@ -0,0 +1,30 @@ +#' @export +pb_deliver_paper.ndr_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + if (json_df$`@type` != "VideoObject" && json_df$`@type` != "AudioObject") { # NewsArticle + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".modulepadding.copytext p, .modulepadding.copytext h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + } else { + datetime <- lubridate::as_datetime(json_df$uploadDate) + headline <- json_df$name + author <- "" + text <- json_df$description + } + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index fab9b2d..8f7df34 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -54,6 +54,7 @@ "metronieuws.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.metronieuws.nl/feed/" "morgenpost.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "msnbc.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA +"ndr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "http://www.ndr.de/home/index-rss.xml" "newstatesman.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.newstatesman.com/feed/" 
"newsweek.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "nos.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.nos.nl/nosnieuwsalgemeen" From 744e0f7c9d6a6b00f961c6c182308d482fb69288 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Wed, 16 Oct 2024 22:12:15 +0200 Subject: [PATCH 031/121] added br.de --- NAMESPACE | 1 + R/deliver_br_de.R | 41 +++++++++++++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 43 insertions(+) create mode 100644 R/deliver_br_de.R diff --git a/NAMESPACE b/NAMESPACE index f583e30..ef0a16e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ S3method(pb_deliver_paper,bbc_co_uk) S3method(pb_deliver_paper,berliner_zeitung_de) S3method(pb_deliver_paper,bild_de) S3method(pb_deliver_paper,blesk_cz) +S3method(pb_deliver_paper,br_de) S3method(pb_deliver_paper,breakingnews_ie) S3method(pb_deliver_paper,breitbart_com) S3method(pb_deliver_paper,buzzfeed_com) diff --git a/R/deliver_br_de.R b/R/deliver_br_de.R new file mode 100644 index 0000000..bf8bb09 --- /dev/null +++ b/R/deliver_br_de.R @@ -0,0 +1,41 @@ +#' @export +pb_deliver_paper.br_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + json_df <- lapply(json_txt, jsonlite::fromJSON) + if (is.null(names(json_df))) { + types <- sapply(json_df, function(x) x$`@type`) + if (any(types == "NewsArticle")) { + json_df <- json_df[types == "NewsArticle"][[1]] + } else if (any(type == "VideoObject")) { + json_df <- json_df[types == "VideoObject"][[1]] + } else if (any(type == "AudioObject")) { + json_df <- json_df[types == "AudioObject"][[1]] + } + } + if (json_df$`@type` != "VideoObject" && json_df$`@type` != "AudioObject") { # NewsArticle + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_node(".RichText_richText__wS9Rz.body3") |> + rvest::html_nodes("p, h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + } else { + datetime <- lubridate::as_datetime(json_df$uploadDate) + headline <- json_df$name + author <- "" + text <- json_df$description + } + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 8f7df34..26ebcbb 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -10,6 +10,7 @@ "blesk.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.blesk.cz/rss" "boston.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "bostonglobe.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA 
+"br.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://nachrichtenfeeds.br.de/rss/nachrichten/seiten/QXAPwyN" "breakingnews.ie","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.breakingnews.ie/feed/all.rss" "breitbart.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "buzzfeed.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 67ec519d60101e5ce5ea48ab65fde5f177dac48f Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 07:18:35 +0200 Subject: [PATCH 032/121] added t3n.de --- NAMESPACE | 1 + R/deliver_t3n_de.R | 23 +++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 25 insertions(+) create mode 100644 R/deliver_t3n_de.R diff --git a/NAMESPACE b/NAMESPACE index ef0a16e..280e8e8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -73,6 +73,7 @@ S3method(pb_deliver_paper,sky_com) S3method(pb_deliver_paper,spiegel_de) S3method(pb_deliver_paper,stern_de) S3method(pb_deliver_paper,sueddeutsche_de) +S3method(pb_deliver_paper,t3n_de) S3method(pb_deliver_paper,t_online_de) S3method(pb_deliver_paper,tag24_de) S3method(pb_deliver_paper,tagesschau_de) diff --git a/R/deliver_t3n_de.R b/R/deliver_t3n_de.R new file mode 100644 index 0000000..7345e19 --- /dev/null +++ b/R/deliver_t3n_de.R @@ -0,0 +1,23 @@ +#' @export +pb_deliver_paper.t3n_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- gsub("\r\n", "\n", json_df$articleBody) + text <- gsub("\\[.*?\\]", "", text) + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 26ebcbb..44e104d 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -82,6 +82,7 @@ "tag24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "tagesschau.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "tagesspiegel.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.tagesspiegel.de/contentexport/feed/home" +"t3n.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://t3n.de/rss.xml" "telegraaf.nl","![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)","[@JBGruber](https://github.com/JBGruber/)","[#17](https://github.com/JBGruber/paperboy/issues/17)","https://www.telegraaf.nl/rss.xml" "telegraph.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA 
"thecanary.co","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 30b0385b8d40b445d60f0fb0914e650421796803 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 07:39:28 +0200 Subject: [PATCH 033/121] added karlsruhe-insider.de --- NAMESPACE | 1 + R/deliver_karlsruhe_insider_de.R | 29 +++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 31 insertions(+) create mode 100644 R/deliver_karlsruhe_insider_de.R diff --git a/NAMESPACE b/NAMESPACE index 280e8e8..adccb9c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -43,6 +43,7 @@ S3method(pb_deliver_paper,irishmirror_ie) S3method(pb_deliver_paper,irishtimes_com) S3method(pb_deliver_paper,irozhlas_cz) S3method(pb_deliver_paper,joe_ie) +S3method(pb_deliver_paper,karlsruhe_insider_de) S3method(pb_deliver_paper,latimes_com) S3method(pb_deliver_paper,lidovky_cz) S3method(pb_deliver_paper,marketwatch_com) diff --git a/R/deliver_karlsruhe_insider_de.R b/R/deliver_karlsruhe_insider_de.R new file mode 100644 index 0000000..1c7cae4 --- /dev/null +++ b/R/deliver_karlsruhe_insider_de.R @@ -0,0 +1,29 @@ +#' @export +pb_deliver_paper.karlsruhe_insider_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + json_df <- json_df$`@graph` + if (any(json_df$`@type` == "Person")) { + author <- toString(json_df$name[json_df$`@type` == "Person"]) + } + json_df <- json_df[1, ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + text <- html %>% + rvest::html_node("article .td-post-content") |> + rvest::html_nodes("p, h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 44e104d..f27cb05 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -46,6 +46,7 @@ "irishtimes.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.irishtimes.com/arc/outboundfeeds/feed-irish-news/" "irozhlas.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.irozhlas.cz/rss/irozhlas" "joe.ie","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.joe.ie/feed" +"karlsruhe-insider.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "latimes.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.latimes.com/politics/rss2.0.xml" "lidovky.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://servis.lidovky.cz/rss.aspx" 
"lnk.techrepublic.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA From 755b04ec0908b43e45f90cc0319e307d12cb1b40 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 08:12:00 +0200 Subject: [PATCH 034/121] added mdr.de --- NAMESPACE | 1 + R/deliver_mdr_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_mdr_de.R diff --git a/NAMESPACE b/NAMESPACE index adccb9c..73a7b76 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -47,6 +47,7 @@ S3method(pb_deliver_paper,karlsruhe_insider_de) S3method(pb_deliver_paper,latimes_com) S3method(pb_deliver_paper,lidovky_cz) S3method(pb_deliver_paper,marketwatch_com) +S3method(pb_deliver_paper,mdr_de) S3method(pb_deliver_paper,mediacourant_nl) S3method(pb_deliver_paper,merkur_de) S3method(pb_deliver_paper,metronieuws_nl) diff --git a/R/deliver_mdr_de.R b/R/deliver_mdr_de.R new file mode 100644 index 0000000..4650fc1 --- /dev/null +++ b/R/deliver_mdr_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.mdr_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".einleitung,.paragraph") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index f27cb05..305405c 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -51,6 +51,7 @@ "lidovky.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://servis.lidovky.cz/rss.aspx" "lnk.techrepublic.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "marketwatch.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA +"mdr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "mediacourant.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.mediacourant.nl/feed/" "merkur.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "http://www.merkur.de/rssfeed.rdf" "metronieuws.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.metronieuws.nl/feed/" From 4fc65032cb88f7beccfef47adea3fc974203248d Mon Sep 17 
00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 09:07:00 +0200 Subject: [PATCH 035/121] added ruhr24.de --- NAMESPACE | 1 + R/deliver_ruhr24_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_ruhr24_de.R diff --git a/NAMESPACE b/NAMESPACE index 73a7b76..0bd1c30 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -68,6 +68,7 @@ S3method(pb_deliver_paper,rp_online_de) S3method(pb_deliver_paper,rte_ie) S3method(pb_deliver_paper,rtl_de) S3method(pb_deliver_paper,rtl_nl) +S3method(pb_deliver_paper,ruhr24_de) S3method(pb_deliver_paper,seznamzpravy_cz) S3method(pb_deliver_paper,sfgate_com) S3method(pb_deliver_paper,skwawkbox_org) diff --git a/R/deliver_ruhr24_de.R b/R/deliver_ruhr24_de.R new file mode 100644 index 0000000..8097e81 --- /dev/null +++ b/R/deliver_ruhr24_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.ruhr24_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + json_df <- json_df$mainEntity + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-crosshead,.id-StoryElement-paragraph") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 305405c..149c31e 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -74,6 +74,7 @@ "rte.ie","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.rte.ie/feeds/rss/?index=/news/" 
"rtl.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.rtl.de/rss/feed/news" "rtl.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.rtlnieuws.nl/rss.xml" +"ruhr24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "seznamzpravy.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.seznamzpravy.cz/rss" "sfgate.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "skwawkbox.org","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From c0a0efe5a47abd03b384dc5259f7ea0e1853d837 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 09:10:41 +0200 Subject: [PATCH 036/121] added tz.de --- NAMESPACE | 1 + R/deliver_tz_de.r | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_tz_de.r diff --git a/NAMESPACE b/NAMESPACE index 0bd1c30..f96cba9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -87,6 +87,7 @@ S3method(pb_deliver_paper,thecanary_co) S3method(pb_deliver_paper,theguardian_com) S3method(pb_deliver_paper,thejournal_ie) S3method(pb_deliver_paper,thesun_ie) +S3method(pb_deliver_paper,tz_de) S3method(pb_deliver_paper,usatoday_com) S3method(pb_deliver_paper,volkskrant_nl) S3method(pb_deliver_paper,washingtonpost_com) diff --git a/R/deliver_tz_de.r b/R/deliver_tz_de.r new file mode 100644 index 0000000..5a81d1a --- /dev/null +++ b/R/deliver_tz_de.r @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.tz_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + json_df <- json_df$mainEntity + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-crosshead,.id-StoryElement-paragraph") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 149c31e..8ffb9a2 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -97,6 +97,7 @@ "thismorningwithgordondeal.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "time.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "tribpub.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA +"tz.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://www.tz.de/welt/rssfeed.rdf" "t-online.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.t-online.de/nachrichten/feed.rss" "us.cnn.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 
7be71e4e11fdb9723e44bc5a2b463e55b3b96ee6 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 09:28:06 +0200 Subject: [PATCH 037/121] added swr.de --- NAMESPACE | 1 + R/deliver_swr_de.R | 30 ++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 32 insertions(+) create mode 100644 R/deliver_swr_de.R diff --git a/NAMESPACE b/NAMESPACE index f96cba9..040df70 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -76,6 +76,7 @@ S3method(pb_deliver_paper,sky_com) S3method(pb_deliver_paper,spiegel_de) S3method(pb_deliver_paper,stern_de) S3method(pb_deliver_paper,sueddeutsche_de) +S3method(pb_deliver_paper,swr_de) S3method(pb_deliver_paper,t3n_de) S3method(pb_deliver_paper,t_online_de) S3method(pb_deliver_paper,tag24_de) diff --git a/R/deliver_swr_de.R b/R/deliver_swr_de.R new file mode 100644 index 0000000..eeac63b --- /dev/null +++ b/R/deliver_swr_de.R @@ -0,0 +1,30 @@ +#' @export +pb_deliver_paper.swr_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + + datetime <- html %>% + rvest::html_node("time") %>% + rvest::html_attr("datetime") %>% + lubridate::as_datetime() + headline <- html %>% + rvest::html_node("h1.headline") %>% + rvest::html_text() + author <- html %>% + rvest::html_nodes(".meta-top .meta-authors .meta-author-name a") %>% + rvest::html_text2() %>% + toString() + text <- html %>% + rvest::html_nodes(".detail-body .lead, .bodytext p, .bodytext h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text + ) +} diff --git a/inst/status.csv b/inst/status.csv index 8ffb9a2..7cbd904 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -82,6 +82,7 @@ 
"spiegel.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.spiegel.de/schlagzeilen/index.rss" "stern.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.stern.de/feed/standard/all/" "sueddeutsche.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://rss.sueddeutsche.de/alles" +"swr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "tag24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "tagesschau.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "tagesspiegel.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.tagesspiegel.de/contentexport/feed/home" From 28ea94ca784466a31c1f1657eb4b6aa228f4982a Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 09:33:54 +0200 Subject: [PATCH 038/121] added swp.de --- NAMESPACE | 1 + R/deliver_swp_de.R | 24 ++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 26 insertions(+) create mode 100644 R/deliver_swp_de.R diff --git a/NAMESPACE b/NAMESPACE index 040df70..bd3cb1f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -76,6 +76,7 @@ S3method(pb_deliver_paper,sky_com) 
S3method(pb_deliver_paper,spiegel_de) S3method(pb_deliver_paper,stern_de) S3method(pb_deliver_paper,sueddeutsche_de) +S3method(pb_deliver_paper,swp_de) S3method(pb_deliver_paper,swr_de) S3method(pb_deliver_paper,t3n_de) S3method(pb_deliver_paper,t_online_de) diff --git a/R/deliver_swp_de.R b/R/deliver_swp_de.R new file mode 100644 index 0000000..13009db --- /dev/null +++ b/R/deliver_swp_de.R @@ -0,0 +1,24 @@ +#' @export +pb_deliver_paper.swp_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".u-article-header .fs-4,.u-paragraph, .u-title.u-headline") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 7cbd904..568c112 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -82,6 +82,7 @@ "spiegel.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.spiegel.de/schlagzeilen/index.rss" "stern.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.stern.de/feed/standard/all/" "sueddeutsche.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://rss.sueddeutsche.de/alles" 
+"swp.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "swr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "tag24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "tagesschau.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA From 949018be7419a0a2b0b3013b9c2d1e1550360ea7 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 09:57:14 +0200 Subject: [PATCH 039/121] added augsburger-allgemeine.de --- NAMESPACE | 1 + R/deliver_augsburger_allgemeine.R | 31 +++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 33 insertions(+) create mode 100644 R/deliver_augsburger_allgemeine.R diff --git a/NAMESPACE b/NAMESPACE index bd3cb1f..8d9c121 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ S3method(pb_deliver_paper,ac24_cz) S3method(pb_deliver_paper,ad_nl) S3method(pb_deliver_paper,aktualne_cz) S3method(pb_deliver_paper,anotherangryvoice_blogspot_com) +S3method(pb_deliver_paper,augsburger_allgemeine_de) S3method(pb_deliver_paper,badische_zeitung_de) S3method(pb_deliver_paper,bbc_co_uk) S3method(pb_deliver_paper,berliner_zeitung_de) diff --git a/R/deliver_augsburger_allgemeine.R b/R/deliver_augsburger_allgemeine.R new file mode 100644 index 0000000..83d448e --- /dev/null +++ b/R/deliver_augsburger_allgemeine.R @@ -0,0 +1,31 @@ +#' @export +pb_deliver_paper.augsburger_allgemeine_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + + datetime <- html %>% + rvest::html_node("time") %>% + rvest::html_attr("datetime") %>% + lubridate::as_datetime() + headline <- html %>% + rvest::html_node("h2.typo-teaserheadline-SoleXL, h2.typo-articleheadline-Recife") %>% + rvest::html_text() + author <- html %>% + rvest::html_nodes("a.typo-author-link") %>% + rvest::html_text2() %>% + toString() + text <- html %>% + rvest::html_nodes(".typo-article-teaser-Recife, .typo-article-teaser, .article-body-paid-content, .typo-subhead, p.text-xs") %>% + rvest::html_text2() %>% + unique() %>% # teaser might be duplicated + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text + ) +} diff --git a/inst/status.csv b/inst/status.csv index 568c112..35c9c0f 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -3,6 +3,7 @@ "ad.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.ad.nl/home/rss.xml" "aktualne.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.aktualne.cz/rss" "anotherangryvoice.blogspot.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA +"augsburger-allgemeine.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "http://www.augsburger-allgemeine.de/augsburg/rss" "badische-zeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "bbc.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.bbci.co.uk/news/rss.xml" 
"berliner-zeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.berliner-zeitung.de/feed.xml" From 466eb5fb8f1be5350e3fb3600257f19f5b6cbd19 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 10:03:32 +0200 Subject: [PATCH 040/121] added watson.de --- NAMESPACE | 1 + R/deliver_watson_de.R | 22 ++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 24 insertions(+) create mode 100644 R/deliver_watson_de.R diff --git a/NAMESPACE b/NAMESPACE index 8d9c121..976eb34 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -94,6 +94,7 @@ S3method(pb_deliver_paper,tz_de) S3method(pb_deliver_paper,usatoday_com) S3method(pb_deliver_paper,volkskrant_nl) S3method(pb_deliver_paper,washingtonpost_com) +S3method(pb_deliver_paper,watson_de) S3method(pb_deliver_paper,welt_de) S3method(pb_deliver_paper,wsj_com) S3method(pb_deliver_paper,yahoo_com) diff --git a/R/deliver_watson_de.R b/R/deliver_watson_de.R new file mode 100644 index 0000000..5812ce4 --- /dev/null +++ b/R/deliver_watson_de.R @@ -0,0 +1,22 @@ +#' @export +pb_deliver_paper.watson_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- json_df$articleBody + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 35c9c0f..ee33a44 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -106,6 +106,7 @@ "usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "volkskrant.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.volkskrant.nl/rss.xml" "washingtonpost.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.washingtonpost.com/rss/world" +"watson.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "welt.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.welt.de/feeds/latest.rss" "wsj.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.a.dj.com/rss/RSSWorldNews.xml" "yahoo.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://news.yahoo.com/rss.xml" From 333634474fc63abc0e4688326139dfd8c43fd5c7 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: 
Thu, 17 Oct 2024 10:47:53 +0200 Subject: [PATCH 041/121] added wiwo.de --- NAMESPACE | 1 + R/deliver_wiwo_de.R | 33 +++++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 35 insertions(+) create mode 100644 R/deliver_wiwo_de.R diff --git a/NAMESPACE b/NAMESPACE index 976eb34..205650c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -96,6 +96,7 @@ S3method(pb_deliver_paper,volkskrant_nl) S3method(pb_deliver_paper,washingtonpost_com) S3method(pb_deliver_paper,watson_de) S3method(pb_deliver_paper,welt_de) +S3method(pb_deliver_paper,wiwo_de) S3method(pb_deliver_paper,wsj_com) S3method(pb_deliver_paper,yahoo_com) S3method(pb_deliver_paper,zdf_de) diff --git a/R/deliver_wiwo_de.R b/R/deliver_wiwo_de.R new file mode 100644 index 0000000..33807a7 --- /dev/null +++ b/R/deliver_wiwo_de.R @@ -0,0 +1,33 @@ +#' @export +pb_deliver_paper.wiwo_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + if (length(json_txt) != 0) { # otherwise the article is paywalled and not scrapeable + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$creator) + text <- html %>% + rvest::html_nodes(".c-leadtext,.u-richtext h3,.u-richtext p") %>% + rvest::html_text2() %>% + .[!grepl("Lesen Sie auch", .)] %>% # Remove links in between + paste(collapse = "\n") + } else { + datetime <- NA + headline <- NA + author <- NA + text <- NA + json_df <- list("no access") + } + s_n_list( + datetime, + author, + headline, + text, + json_df + ) +} diff --git a/inst/status.csv b/inst/status.csv index ee33a44..724a250 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -108,6 +108,7 @@ 
"washingtonpost.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.washingtonpost.com/rss/world" "watson.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "welt.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.welt.de/feeds/latest.rss" +"wiwo.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.wiwo.de/contentexport/feed/rss/schlagzeilen" "wsj.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.a.dj.com/rss/RSSWorldNews.xml" "yahoo.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://news.yahoo.com/rss.xml" "zdf.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.zdf.de/rss/zdf/nachrichten" From 794a79b60e982a7c5a1c6ed597c7c0986871b149 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 11:15:18 +0200 Subject: [PATCH 042/121] added rnd.de --- NAMESPACE | 1 + R/deliver_rnd_de.R | 29 +++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 31 insertions(+) create mode 100644 R/deliver_rnd_de.R diff --git a/NAMESPACE b/NAMESPACE index 205650c..00b0875 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -65,6 +65,7 @@ S3method(pb_deliver_paper,nypost_com) S3method(pb_deliver_paper,nytimes_com) S3method(pb_deliver_paper,parlamentnilisty_cz) S3method(pb_deliver_paper,prosieben_de) 
+S3method(pb_deliver_paper,rnd_de) S3method(pb_deliver_paper,rp_online_de) S3method(pb_deliver_paper,rte_ie) S3method(pb_deliver_paper,rtl_de) diff --git a/R/deliver_rnd_de.R b/R/deliver_rnd_de.R new file mode 100644 index 0000000..5052eec --- /dev/null +++ b/R/deliver_rnd_de.R @@ -0,0 +1,29 @@ +#' @export +pb_deliver_paper.rnd_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[3] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0") %>% + rvest::html_text2() + + more_items <- html %>% # delete content in lists of related items + rvest::html_nodes("div[data-is-element-rendered='true']") %>% + rvest::html_nodes(".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0") %>% + rvest::html_text2() + text <- text[!text %in% more_items] %>% paste(collapse = "\n") + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 724a250..bfa6b41 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -71,6 +71,7 @@ "pagesix.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "parlamentnilisty.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","http://www.parlamentnilisty.cz/export/rss.aspx" "prosieben.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA 
+"rnd.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.rnd.de/arc/outboundfeeds/rss/" "rp-online.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://rp-online.de/feed.rss" "rte.ie","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.rte.ie/feeds/rss/?index=/news/" "rtl.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.rtl.de/rss/feed/news" From 228437cdb8a6e61a71455677e0e272085bcfa839 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 11:27:20 +0200 Subject: [PATCH 043/121] added news.de --- NAMESPACE | 1 + R/deliver_news_de.R | 23 +++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 25 insertions(+) create mode 100644 R/deliver_news_de.R diff --git a/NAMESPACE b/NAMESPACE index 00b0875..cf23d6d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -55,6 +55,7 @@ S3method(pb_deliver_paper,metronieuws_nl) S3method(pb_deliver_paper,morgenpost_de) S3method(pb_deliver_paper,n_tv_de) S3method(pb_deliver_paper,ndr_de) +S3method(pb_deliver_paper,news_de) S3method(pb_deliver_paper,newstatesman_com) S3method(pb_deliver_paper,newsweek_com) S3method(pb_deliver_paper,nos_nl) diff --git a/R/deliver_news_de.R b/R/deliver_news_de.R new file mode 100644 index 0000000..024b887 --- /dev/null +++ b/R/deliver_news_de.R @@ -0,0 +1,23 @@ +#' @export +pb_deliver_paper.news_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- trimws(gsub("\\+\\+\\+.*?\\+\\+\\+", "", json_df$articleBody)) + text <- gsub("\r\n", "\n", text) + text <- gsub("Folgen Sie.*", "", text) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index bfa6b41..38000ed 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -59,6 +59,7 @@ "morgenpost.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "msnbc.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "ndr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "http://www.ndr.de/home/index-rss.xml" +"news.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.news.de/rss/364367598/politik/" "newstatesman.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.newstatesman.com/feed/" "newsweek.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA 
"nos.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.nos.nl/nosnieuwsalgemeen" From d5167010268887e5f4ddd5764833253084752194 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 11:38:29 +0200 Subject: [PATCH 044/121] added deutschlandfunk.de --- NAMESPACE | 1 + R/deliver_deutschlandfunk_de.R | 26 ++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 28 insertions(+) create mode 100644 R/deliver_deutschlandfunk_de.R diff --git a/NAMESPACE b/NAMESPACE index cf23d6d..8e602fd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,6 +25,7 @@ S3method(pb_deliver_paper,dailymail_co_uk) S3method(pb_deliver_paper,default) S3method(pb_deliver_paper,denikn_cz) S3method(pb_deliver_paper,derwesten_de) +S3method(pb_deliver_paper,deutschlandfunk_de) S3method(pb_deliver_paper,evolvepolitics_com) S3method(pb_deliver_paper,faz_net) S3method(pb_deliver_paper,focus_de) diff --git a/R/deliver_deutschlandfunk_de.R b/R/deliver_deutschlandfunk_de.R new file mode 100644 index 0000000..06601ed --- /dev/null +++ b/R/deliver_deutschlandfunk_de.R @@ -0,0 +1,26 @@ +#' @export +pb_deliver_paper.deutschlandfunk_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + datetime <- html %>% + rvest::html_node("time") %>% + rvest::html_attr("datetime") %>% + lubridate::as_datetime() + headline <- html %>% + rvest::html_node(".headline-title") %>% + rvest::html_text() + author <- "deutschlandfunk.de" # could not find article with author + text <- html %>% + rvest::html_nodes(".article-header-description,.article-details-text:not(.u-text-italic),.article-details-title") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text + ) +} diff --git a/inst/status.csv b/inst/status.csv index 38000ed..3d2c9f9 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -22,6 +22,7 @@ "dailymail.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.dailymail.co.uk/news/index.rss" "decider.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "denikn.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://denikn.cz/rss" +"deutschlandfunk.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.deutschlandfunk.de/nachrichten-100.rss" "badische-zeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.derwesten.de/feed" "edition.cnn.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","http://rss.cnn.com/rss/edition.rss" "eu.usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 
b34e02a01edc3f794e2583e10f1e994346afa4c9 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 11:45:16 +0200 Subject: [PATCH 045/121] added businessinsider.de --- NAMESPACE | 1 + R/deliver_businessinsider_de.R | 31 +++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 33 insertions(+) create mode 100644 R/deliver_businessinsider_de.R diff --git a/NAMESPACE b/NAMESPACE index 8e602fd..b71b8e5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,6 +16,7 @@ S3method(pb_deliver_paper,blesk_cz) S3method(pb_deliver_paper,br_de) S3method(pb_deliver_paper,breakingnews_ie) S3method(pb_deliver_paper,breitbart_com) +S3method(pb_deliver_paper,businessinsider_de) S3method(pb_deliver_paper,buzzfeed_com) S3method(pb_deliver_paper,cbsnews_com) S3method(pb_deliver_paper,ceskatelevize_cz) diff --git a/R/deliver_businessinsider_de.R b/R/deliver_businessinsider_de.R new file mode 100644 index 0000000..8c70e0b --- /dev/null +++ b/R/deliver_businessinsider_de.R @@ -0,0 +1,31 @@ +#' @export +pb_deliver_paper.businessinsider_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + json_df <- json_df$`@graph` + if (any(json_df$`@type` == "Person")) { + author <- toString(json_df$name[json_df$`@type` == "Person"]) + } else { + author <- "" + } + json_df <- json_df[1, ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + text <- html %>% + rvest::html_node(".article-main") |> + rvest::html_nodes("p, h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 3d2c9f9..9d0d8b1 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -14,6 +14,7 @@ "br.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://nachrichtenfeeds.br.de/rss/nachrichten/seiten/QXAPwyN" "breakingnews.ie","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.breakingnews.ie/feed/all.rss" "breitbart.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA +"businessinsider.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.businessinsider.de/feed/businessinsider-alle-artikel" "buzzfeed.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA 
"cbslnk.cbsileads.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "cbsnews.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.cbsnews.com/latest/rss/evening-news" From a753622e65bb2ba84c8ae77573186b053104ca65 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 11:45:54 +0200 Subject: [PATCH 046/121] added empty author if not found for ka_insider_de --- R/deliver_karlsruhe_insider_de.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/deliver_karlsruhe_insider_de.R b/R/deliver_karlsruhe_insider_de.R index 1c7cae4..a7bd7c9 100644 --- a/R/deliver_karlsruhe_insider_de.R +++ b/R/deliver_karlsruhe_insider_de.R @@ -9,6 +9,8 @@ pb_deliver_paper.karlsruhe_insider_de <- function(x, verbose = NULL, pb, ...) { json_df <- json_df$`@graph` if (any(json_df$`@type` == "Person")) { author <- toString(json_df$name[json_df$`@type` == "Person"]) + } else { + author <- "" } json_df <- json_df[1, ] datetime <- lubridate::as_datetime(json_df$datePublished) From aaf24c25297a2ffbbff72f633c7c71dcb5f7cbaa Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 12:18:40 +0200 Subject: [PATCH 047/121] added nzz.ch --- NAMESPACE | 1 + R/deliver_nzz_ch.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_nzz_ch.R diff --git a/NAMESPACE b/NAMESPACE index b71b8e5..59c45cc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -66,6 +66,7 @@ S3method(pb_deliver_paper,nrc_nl) S3method(pb_deliver_paper,nu_nl) S3method(pb_deliver_paper,nypost_com) S3method(pb_deliver_paper,nytimes_com) +S3method(pb_deliver_paper,nzz_ch) S3method(pb_deliver_paper,parlamentnilisty_cz) S3method(pb_deliver_paper,prosieben_de) S3method(pb_deliver_paper,rnd_de) diff --git a/R/deliver_nzz_ch.R b/R/deliver_nzz_ch.R new file mode 100644 index 
0000000..f3012a2 --- /dev/null +++ b/R/deliver_nzz_ch.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.nzz_ch <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".headline__lead,.articlecomponent.text,.subtitle,.articlecomponent") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 9d0d8b1..fa53eb4 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -71,6 +71,7 @@ "nu.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.nu.nl/rss" "nypost.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "nytimes.com","![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)","[@JBGruber](https://github.com/JBGruber/)","[#17](https://github.com/JBGruber/paperboy/issues/17)","https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml" +"nzz.ch","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.nzz.ch/recent.rss" "pagesix.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA 
"parlamentnilisty.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","http://www.parlamentnilisty.cz/export/rss.aspx" "prosieben.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA From e793f4b82e79d00ac0b8d326acd51cee333c8beb Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 12:25:13 +0200 Subject: [PATCH 048/121] added waz.de --- NAMESPACE | 1 + R/deliver_waz_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_waz_de.R diff --git a/NAMESPACE b/NAMESPACE index 59c45cc..f3bab75 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -100,6 +100,7 @@ S3method(pb_deliver_paper,usatoday_com) S3method(pb_deliver_paper,volkskrant_nl) S3method(pb_deliver_paper,washingtonpost_com) S3method(pb_deliver_paper,watson_de) +S3method(pb_deliver_paper,waz_de) S3method(pb_deliver_paper,welt_de) S3method(pb_deliver_paper,wiwo_de) S3method(pb_deliver_paper,wsj_com) diff --git a/R/deliver_waz_de.R b/R/deliver_waz_de.R new file mode 100644 index 0000000..b6d275a --- /dev/null +++ b/R/deliver_waz_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.waz_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".article-body p,.article-body h3") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index fa53eb4..a0a386c 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -112,6 +112,7 @@ "volkskrant.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.volkskrant.nl/rss.xml" "washingtonpost.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.washingtonpost.com/rss/world" "watson.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA +"waz.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.waz.de/rss" "welt.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.welt.de/feeds/latest.rss" "wiwo.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.wiwo.de/contentexport/feed/rss/schlagzeilen" 
"wsj.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.a.dj.com/rss/RSSWorldNews.xml" From d5a04aff5172d4d5e9bdec6b53bd03767bf7855c Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 12:31:03 +0200 Subject: [PATCH 049/121] added finanzen.net --- NAMESPACE | 1 + R/deliver_finanzen_net.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_finanzen_net.R diff --git a/NAMESPACE b/NAMESPACE index f3bab75..e651ac1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -29,6 +29,7 @@ S3method(pb_deliver_paper,derwesten_de) S3method(pb_deliver_paper,deutschlandfunk_de) S3method(pb_deliver_paper,evolvepolitics_com) S3method(pb_deliver_paper,faz_net) +S3method(pb_deliver_paper,finanzen_net) S3method(pb_deliver_paper,focus_de) S3method(pb_deliver_paper,forbes_com) S3method(pb_deliver_paper,foxbusiness_com) diff --git a/R/deliver_finanzen_net.R b/R/deliver_finanzen_net.R new file mode 100644 index 0000000..f6abc23 --- /dev/null +++ b/R/deliver_finanzen_net.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.finanzen_net <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("p.h3, .news-container__text p, .news-container__text h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index a0a386c..8b2b330 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -29,6 +29,7 @@ "eu.usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "evolvepolitics.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "faz.net","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.faz.net/rss/aktuell/" +"finanzen.net","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.finanzen.net/rss/news" "focus.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://rss.focus.de/fol/XML/rss_folnews.xml" "forbes.com","![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)","[@JBGruber](https://github.com/JBGruber/)","[#2](https://github.com/JBGruber/paperboy/issues/2)",NA "fortune.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA From 
5e864025d883552ebe7875abaa17579cb15011d9 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 12:37:34 +0200 Subject: [PATCH 050/121] added presseportal.de --- NAMESPACE | 1 + R/deliver_presseportal_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_presseportal_de.R diff --git a/NAMESPACE b/NAMESPACE index e651ac1..1355d85 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -69,6 +69,7 @@ S3method(pb_deliver_paper,nypost_com) S3method(pb_deliver_paper,nytimes_com) S3method(pb_deliver_paper,nzz_ch) S3method(pb_deliver_paper,parlamentnilisty_cz) +S3method(pb_deliver_paper,presseportal_de) S3method(pb_deliver_paper,prosieben_de) S3method(pb_deliver_paper,rnd_de) S3method(pb_deliver_paper,rp_online_de) diff --git a/R/deliver_presseportal_de.R b/R/deliver_presseportal_de.R new file mode 100644 index 0000000..4aff63a --- /dev/null +++ b/R/deliver_presseportal_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.presseportal_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("article.story p:not([class])") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 8b2b330..5dc2cee 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -75,6 +75,7 @@ "nzz.ch","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.nzz.ch/recent.rss" "pagesix.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "parlamentnilisty.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","http://www.parlamentnilisty.cz/export/rss.aspx" +"presseportal.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.presseportal.de/rss/presseportal.rss2" "prosieben.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "rnd.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.rnd.de/arc/outboundfeeds/rss/" 
"rp-online.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://rp-online.de/feed.rss" From 0c4398db14b977efd2d128192bd624fb7c38ed52 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 13:02:08 +0200 Subject: [PATCH 051/121] added wdr.de --- NAMESPACE | 1 + R/deliver_wdr_de.R | 28 ++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 30 insertions(+) create mode 100644 R/deliver_wdr_de.R diff --git a/NAMESPACE b/NAMESPACE index 1355d85..fdd68ff 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -103,6 +103,7 @@ S3method(pb_deliver_paper,volkskrant_nl) S3method(pb_deliver_paper,washingtonpost_com) S3method(pb_deliver_paper,watson_de) S3method(pb_deliver_paper,waz_de) +S3method(pb_deliver_paper,wdr_de) S3method(pb_deliver_paper,welt_de) S3method(pb_deliver_paper,wiwo_de) S3method(pb_deliver_paper,wsj_com) diff --git a/R/deliver_wdr_de.R b/R/deliver_wdr_de.R new file mode 100644 index 0000000..c4bfb62 --- /dev/null +++ b/R/deliver_wdr_de.R @@ -0,0 +1,28 @@ +#' @export +pb_deliver_paper.wdr_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + # careful: json can have many objects but the first seems to be the article + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + date_tmp <- json_df$datePublished # missing sec + date_tmp <- sub("(\\d{2}:\\d{2})(\\+\\d{2}:\\d{2})", "\\1:00\\2", date_tmp) + datetime <- lubridate::as_datetime(date_tmp) + headline <- json_df$headline + author <- toString(json_df$author$name) %>% gsub("/", ",", .) 
+ text <- html %>% + rvest::html_nodes(".einleitung,.text,.subtitle") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} +# rss feed contains also overviews of articles which make the parser fail diff --git a/inst/status.csv b/inst/status.csv index 5dc2cee..170a96d 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -115,6 +115,7 @@ "washingtonpost.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.washingtonpost.com/rss/world" "watson.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "waz.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.waz.de/rss" +"wdr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www1.wdr.de/wissen/uebersicht-nachrichten-100.feed" "welt.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.welt.de/feeds/latest.rss" "wiwo.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.wiwo.de/contentexport/feed/rss/schlagzeilen" "wsj.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.a.dj.com/rss/RSSWorldNews.xml" From 63798f38d354107d3f824b0a19ac856aec938ed8 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> 
Date: Thu, 17 Oct 2024 13:07:31 +0200 Subject: [PATCH 052/121] added hna.de --- NAMESPACE | 1 + R/deliver_hna_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_hna_de.R diff --git a/NAMESPACE b/NAMESPACE index fdd68ff..cfca41a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -38,6 +38,7 @@ S3method(pb_deliver_paper,geenstijl_nl) S3method(pb_deliver_paper,handelsblatt_com) S3method(pb_deliver_paper,heise_de) S3method(pb_deliver_paper,hn_cz) +S3method(pb_deliver_paper,hna_de) S3method(pb_deliver_paper,huffpost_com) S3method(pb_deliver_paper,idnes_cz) S3method(pb_deliver_paper,independent_co_uk) diff --git a/R/deliver_hna_de.R b/R/deliver_hna_de.R new file mode 100644 index 0000000..4346251 --- /dev/null +++ b/R/deliver_hna_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.hna_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + json_df <- json_df$mainEntity + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 170a96d..5313873 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -41,6 +41,7 @@ 
"handelsblatt.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.handelsblatt.com/contentexport/feed/schlagzeilen" "heise.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://www.heise.de/rss/heise.rdf" "hn.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://domaci.hn.cz/?m=rss" +"hna.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.hna.de/politik/rssfeed.xml" "huffingtonpost.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "idnes.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "independent.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 5744393f94176846ee986392d1a5830a09223c40 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 13:18:29 +0200 Subject: [PATCH 053/121] replaced base R pipe with magrittr --- R/deliver_br_de.R | 2 +- R/deliver_businessinsider_de.R | 2 +- R/deliver_karlsruhe_insider_de.R | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/deliver_br_de.R b/R/deliver_br_de.R index bf8bb09..53f0b05 100644 --- a/R/deliver_br_de.R +++ b/R/deliver_br_de.R @@ -21,7 +21,7 @@ pb_deliver_paper.br_de <- function(x, verbose = NULL, pb, ...) 
{ headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_node(".RichText_richText__wS9Rz.body3") |> + rvest::html_node(".RichText_richText__wS9Rz.body3") %>% rvest::html_nodes("p, h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_businessinsider_de.R b/R/deliver_businessinsider_de.R index 8c70e0b..df4f50d 100644 --- a/R/deliver_businessinsider_de.R +++ b/R/deliver_businessinsider_de.R @@ -16,7 +16,7 @@ pb_deliver_paper.businessinsider_de <- function(x, verbose = NULL, pb, ...) { datetime <- lubridate::as_datetime(json_df$datePublished) headline <- json_df$headline text <- html %>% - rvest::html_node(".article-main") |> + rvest::html_node(".article-main") %>% rvest::html_nodes("p, h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_karlsruhe_insider_de.R b/R/deliver_karlsruhe_insider_de.R index a7bd7c9..66e531e 100644 --- a/R/deliver_karlsruhe_insider_de.R +++ b/R/deliver_karlsruhe_insider_de.R @@ -16,7 +16,7 @@ pb_deliver_paper.karlsruhe_insider_de <- function(x, verbose = NULL, pb, ...) 
{ datetime <- lubridate::as_datetime(json_df$datePublished) headline <- json_df$headline text <- html %>% - rvest::html_node("article .td-post-content") |> + rvest::html_node("article .td-post-content") %>% rvest::html_nodes("p, h2") %>% rvest::html_text2() %>% paste(collapse = "\n") From 1c835a8c52935d03c969edd59b5287afe643f251 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 13:18:46 +0200 Subject: [PATCH 054/121] added express.de --- NAMESPACE | 1 + R/deliver_express_de.R | 33 +++++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 35 insertions(+) create mode 100644 R/deliver_express_de.R diff --git a/NAMESPACE b/NAMESPACE index cfca41a..022f31d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -28,6 +28,7 @@ S3method(pb_deliver_paper,denikn_cz) S3method(pb_deliver_paper,derwesten_de) S3method(pb_deliver_paper,deutschlandfunk_de) S3method(pb_deliver_paper,evolvepolitics_com) +S3method(pb_deliver_paper,express_de) S3method(pb_deliver_paper,faz_net) S3method(pb_deliver_paper,finanzen_net) S3method(pb_deliver_paper,focus_de) diff --git a/R/deliver_express_de.R b/R/deliver_express_de.R new file mode 100644 index 0000000..dc498c5 --- /dev/null +++ b/R/deliver_express_de.R @@ -0,0 +1,33 @@ +#' @export +pb_deliver_paper.express_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + json_df <- json_df$`@graph` + if (any(json_df$`@type` == "Person")) { + author <- toString(json_df$name[json_df$`@type` == "Person"]) + } else { + author <- "" + } + json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + text <- html %>% + rvest::html_nodes(".dm-article__intro,.dm-paragraph,.dm-article__subheadline") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + if (author == "") { + # the text has the author abbr. at the end + author <- sub(".*\\(([^)]+)\\)$", "\\1", text) + } + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 5313873..a7cdb85 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -28,6 +28,7 @@ "edition.cnn.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","http://rss.cnn.com/rss/edition.rss" "eu.usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "evolvepolitics.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA +"express.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "faz.net","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.faz.net/rss/aktuell/" 
"finanzen.net","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.finanzen.net/rss/news" "focus.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://rss.focus.de/fol/XML/rss_folnews.xml" From 6496292e2aaaa9c734d6ccc3d87586a91cc0bb83 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 13:25:23 +0200 Subject: [PATCH 055/121] removed extra content in headline for express.de --- R/deliver_express_de.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/deliver_express_de.R b/R/deliver_express_de.R index dc498c5..664f237 100644 --- a/R/deliver_express_de.R +++ b/R/deliver_express_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.express_de <- function(x, verbose = NULL, pb, ...) { } json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline + headline <- sub(" \\| .*", "", json_df$headline) text <- html %>% rvest::html_nodes(".dm-article__intro,.dm-paragraph,.dm-article__subheadline") %>% rvest::html_text2() %>% From 2a9520cd778833aef33534888f1c8b416be4d213 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 13:25:30 +0200 Subject: [PATCH 056/121] added ksta.de --- NAMESPACE | 1 + R/deliver_ksta_de.R | 33 +++++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 35 insertions(+) create mode 100644 R/deliver_ksta_de.R diff --git a/NAMESPACE b/NAMESPACE index 022f31d..f387920 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -50,6 +50,7 @@ S3method(pb_deliver_paper,irishtimes_com) S3method(pb_deliver_paper,irozhlas_cz) S3method(pb_deliver_paper,joe_ie) S3method(pb_deliver_paper,karlsruhe_insider_de) +S3method(pb_deliver_paper,ksta_de) 
# Parser for ksta.de articles.
#
# Reads the raw page from `x$content_raw`, takes the first JSON-LD
# (`application/ld+json`) script and uses the entries of its `@graph` for
# author, publication date and headline. The article text is collected from
# the page itself via CSS selectors.
#
# Arguments:
#   x            object holding the raw html in `x$content_raw`.
#   verbose, pb  handed to pb_tick() (helper defined elsewhere in the package).
#   ...          not used in this method.
#
# Returns the value of s_n_list() built from `datetime`, `author`,
# `headline`, `text` and `json_df` (the NewsArticle entries of the graph).
#' @export
pb_deliver_paper.ksta_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>%
    rvest::html_text()
  json_df <- jsonlite::fromJSON(json_txt)
  json_df <- json_df$`@graph`

  # %in% never returns NA, so entries without an @type cannot turn the
  # condition below into NA (which `if` would reject)
  is_person <- json_df$`@type` %in% "Person"
  if (any(is_person)) {
    author <- toString(json_df$name[is_person])
  } else {
    author <- ""
  }

  json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ]
  datetime <- lubridate::as_datetime(json_df$datePublished)
  # drop everything after the first " | " in the headline
  headline <- sub(" \\| .*", "", json_df$headline)
  text <- html %>%
    rvest::html_nodes(".dm-article__intro,.dm-paragraph,.dm-article__subheadline") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  if (author == "") {
    # the text has the author abbr. at the end, in parentheses.
    # sub() returns its input unchanged when the pattern does not match, so
    # only extract when there is a match; otherwise the whole article text
    # would end up in `author`.
    abbr_pattern <- ".*\\(([^)]+)\\)$"
    if (grepl(abbr_pattern, text)) {
      author <- sub(abbr_pattern, "\\1", text)
    }
  }

  s_n_list(
    datetime,
    author,
    headline,
    text,
    json_df # dumping the whole json data of an article
  )
}
# Parser for suedkurier.de articles.
#
# Publication date and author names come from the first JSON-LD
# (`application/ld+json`) script of the page; headline and article text are
# read from the html stored in `x$content_raw`.
#
# Arguments:
#   x            object holding the raw html in `x$content_raw`.
#   verbose, pb  handed to pb_tick() (helper defined elsewhere in the package).
#   ...          not used in this method.
#
# Returns the value of s_n_list() built from `datetime`, `author`,
# `headline`, `text` and the parsed JSON-LD (`json_df`).
#' @export
pb_deliver_paper.suedkurier_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  ld_nodes <- rvest::html_nodes(page, "script[type = \"application/ld+json\"] ")
  json_df <- jsonlite::fromJSON(rvest::html_text(ld_nodes[1]))

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- rvest::html_text(rvest::html_node(page, "header h1"))

  # the names are wrapped in <p> tags and run through the html parser, so the
  # resulting string is the parsed text of that markup rather than the raw
  # JSON value
  author_markup <- paste0("<p>", json_df$author$name, "</p>", collapse = ",")
  author <- toString(rvest::html_text(rvest::read_html(author_markup)))

  body_nodes <- rvest::html_nodes(
    page,
    ".article-summary,.article-jsonld.article-paywall-summary,.article-jsonld p"
  )
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text,
    json_df # dumping the whole json data of an article
  )
}
"stern.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.stern.de/feed/standard/all/" "sueddeutsche.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://rss.sueddeutsche.de/alles" +"suedkurier.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "swp.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "swr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "tag24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA From c7dd45825845011cc189056a75ac328f1027e6e2 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 15:36:58 +0200 Subject: [PATCH 058/121] added deutschlandfunkkultur.de --- NAMESPACE | 1 + R/deliver_deutschlandfunkkultur_de.R | 28 ++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 30 insertions(+) create mode 100644 R/deliver_deutschlandfunkkultur_de.R diff --git a/NAMESPACE b/NAMESPACE index 7066713..0abf0f4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,6 +27,7 @@ S3method(pb_deliver_paper,default) S3method(pb_deliver_paper,denikn_cz) S3method(pb_deliver_paper,derwesten_de) S3method(pb_deliver_paper,deutschlandfunk_de) +S3method(pb_deliver_paper,deutschlandfunkkultur_de) S3method(pb_deliver_paper,evolvepolitics_com) 
# Parser for deutschlandfunkkultur.de articles.
#
# Everything is read from the html in `x$content_raw` with CSS selectors:
# the `datetime` attribute of the first <time> element, the first headline
# and author nodes, and all text nodes of the article joined by newlines.
#
# Arguments:
#   x            object holding the raw html in `x$content_raw`.
#   verbose, pb  handed to pb_tick() (helper defined elsewhere in the package).
#   ...          not used in this method.
#
# Returns the value of s_n_list() built from `datetime`, `author`,
# `headline` and `text`.
#' @export
pb_deliver_paper.deutschlandfunkkultur_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  time_node <- rvest::html_node(page, "time")
  datetime <- lubridate::as_datetime(rvest::html_attr(time_node, "datetime"))

  headline_node <- rvest::html_node(
    page,
    ".headline-title,.section-article-head-area-title"
  )
  headline <- rvest::html_text(headline_node)

  author <- rvest::html_text(rvest::html_node(page, ".article-header-author"))

  body_nodes <- rvest::html_nodes(
    page,
    ".section-article-head-area-description,.article-header-description,.article-details-text:not(.u-text-italic),.article-details-title"
  )
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
# Parser for kreiszeitung.de articles.
#
# Metadata (date, headline, author names) is taken from the `mainEntity`
# element of the first JSON-LD (`application/ld+json`) script; the article
# text is collected from the html in `x$content_raw`.
#
# Arguments:
#   x            object holding the raw html in `x$content_raw`.
#   verbose, pb  handed to pb_tick() (helper defined elsewhere in the package).
#   ...          not used in this method.
#
# Returns the value of s_n_list() built from `datetime`, `author`,
# `headline`, `text` and `json_df` (the `mainEntity` element).
#' @export
pb_deliver_paper.kreiszeitung_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  ld_nodes <- rvest::html_nodes(page, "script[type = \"application/ld+json\"] ")
  ld_json <- jsonlite::fromJSON(rvest::html_text(ld_nodes[1]))
  json_df <- ld_json$mainEntity

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)

  story_nodes <- rvest::html_nodes(
    page,
    ".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead"
  )
  text <- paste(rvest::html_text2(story_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text,
    json_df # dumping the whole json data of an article
  )
}
# Parser for abendblatt.de articles.
#
# Metadata (date, headline, author names) is taken from the SECOND JSON-LD
# (`application/ld+json`) script of the page; the article text is collected
# from the html in `x$content_raw`.
#
# Arguments:
#   x            object holding the raw html in `x$content_raw`.
#   verbose, pb  handed to pb_tick() (helper defined elsewhere in the package).
#   ...          not used in this method.
#
# Returns the value of s_n_list() built from `datetime`, `author`,
# `headline`, `text` and the parsed JSON-LD (`json_df`).
#' @export
pb_deliver_paper.abendblatt_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  ld_nodes <- rvest::html_nodes(page, "script[type = \"application/ld+json\"] ")
  # index 2: this method reads the second ld+json script, not the first
  json_df <- jsonlite::fromJSON(rvest::html_text(ld_nodes[2]))

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)

  body_nodes <- rvest::html_nodes(page, ".article-body h3, .article-body p")
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text,
    json_df # dumping the whole json data of an article
  )
}
# rss feed includes pages that cannot be parsed because they are subpages
# rss feed also includes podcast, which cannot be parsed
# Parser for stuttgarter-zeitung.de articles.
#
# Metadata (date, headline, author names) is taken from the first JSON-LD
# (`application/ld+json`) script; the article text is collected from the html
# in `x$content_raw`, with two fixed boilerplate strings removed before the
# pieces are joined.
#
# Arguments:
#   x            object holding the raw html in `x$content_raw`.
#   verbose, pb  handed to pb_tick() (helper defined elsewhere in the package).
#   ...          not used in this method.
#
# Returns the value of s_n_list() built from `datetime`, `author`,
# `headline`, `text` and the parsed JSON-LD (`json_df`).
#' @export
pb_deliver_paper.stuttgarter_zeitung_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  ld_nodes <- rvest::html_nodes(page, "script[type = \"application/ld+json\"] ")
  json_df <- jsonlite::fromJSON(rvest::html_text(ld_nodes[1]))

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)

  body_nodes <- rvest::html_nodes(
    page,
    ".brick.intro-text p,.brickgroup p,.brickgroup h2"
  )
  text <- rvest::html_text2(body_nodes)

  # elements that consist solely of one of these strings are dropped
  rm_text <- c("StZ-Plus-Abonnement", "Vertrag mit Werbung")
  keep <- !(text %in% rm_text)
  text <- paste(text[keep], collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text,
    json_df # dumping the whole json data of an article
  )
}
# rss feed includes pages that cannot be parsed because they are subpages
# rss feed also includes podcast, which cannot be parsed
"spiegel.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.spiegel.de/schlagzeilen/index.rss" "stern.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.stern.de/feed/standard/all/" +"stuttgarter-zeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "sueddeutsche.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://rss.sueddeutsche.de/alles" "suedkurier.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "swp.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA From c9471832c725dc934e5bf5a60014d8dab6d5bca7 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 16:17:27 +0200 Subject: [PATCH 062/121] added infranken.de --- NAMESPACE | 1 + R/deliver_infranken_de.R | 22 ++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 24 insertions(+) create mode 100644 R/deliver_infranken_de.R diff --git a/NAMESPACE b/NAMESPACE index d7df42b..f4dd484 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -46,6 +46,7 @@ S3method(pb_deliver_paper,huffpost_com) S3method(pb_deliver_paper,idnes_cz) S3method(pb_deliver_paper,independent_co_uk) S3method(pb_deliver_paper,independent_ie) +S3method(pb_deliver_paper,infranken_de) S3method(pb_deliver_paper,irishexaminer_com) 
# Parser for infranken.de articles.
#
# All fields, including the article text (`articleBody`), are taken from the
# first JSON-LD (`application/ld+json`) script of the html stored in
# `x$content_raw`.
#
# Arguments:
#   x            object holding the raw html in `x$content_raw`.
#   verbose, pb  handed to pb_tick() (helper defined elsewhere in the package).
#   ...          not used in this method.
#
# Returns the value of s_n_list() built from `datetime`, `author`,
# `headline`, `text` and the parsed JSON-LD (`json_df`).
#' @export
pb_deliver_paper.infranken_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  ld_nodes <- rvest::html_nodes(page, "script[type = \"application/ld+json\"] ")
  ld_json <- rvest::html_text(ld_nodes[1])
  json_df <- jsonlite::fromJSON(ld_json)

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)
  text <- json_df$articleBody

  s_n_list(
    datetime,
    author,
    headline,
    text,
    json_df # dumping the whole json data of an article
  )
}
# Parser for rbb24.de articles.
#
# Everything is read from the html in `x$content_raw` with CSS selectors.
# The publication time is extracted from the text of the
# ".technicalline .lineinfo" node(s), which is expected to contain
# "dd.mm.yy | HH:MM".
#
# Arguments:
#   x            object holding the raw html in `x$content_raw`.
#   verbose, pb  handed to pb_tick() (helper defined elsewhere in the package).
#   ...          not used in this method.
#
# Returns the value of s_n_list() built from `datetime`, `author`,
# `headline` and `text`. `author` is always "".
#' @export
pb_deliver_paper.rbb24_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  # The page prints a wall-clock time without zone information. It is parsed
  # as Europe/Berlin local time and then converted to UTC, the zone the
  # previous implementation returned (it read the local time directly as UTC
  # and noted that this was not correct).
  # NOTE(review): Europe/Berlin is assumed because rbb24 is a
  # Berlin/Brandenburg outlet -- confirm against a known article.
  datetime <- html %>%
    rvest::html_nodes(".technicalline .lineinfo") %>%
    rvest::html_text2() %>%
    gsub(".*(\\d{2}\\.\\d{2}\\.\\d{2}) \\| (\\d{2}:\\d{2}).*", "\\1 \\2", .) %>%
    lubridate::as_datetime(format = "%d.%m.%y %H:%M", tz = "Europe/Berlin") %>%
    lubridate::with_tz("UTC")

  headline <- html %>%
    rvest::html_nodes(".titletext") %>%
    rvest::html_text2()

  author <- "" # no article with author info found

  text <- html %>%
    rvest::html_nodes(".shorttext p, .textblock p, h4.texttitle") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
# Parser for abendzeitung-muenchen.de articles.
#
# Metadata (date, headline, author names) is taken from the first JSON-LD
# (`application/ld+json`) script; the article text is collected from the html
# in `x$content_raw`.
#
# Arguments:
#   x            object holding the raw html in `x$content_raw`.
#   verbose, pb  handed to pb_tick() (helper defined elsewhere in the package).
#   ...          not used in this method.
#
# Returns the value of s_n_list() built from `datetime`, `author`,
# `headline`, `text` and the parsed JSON-LD (`json_df`).
#' @export
pb_deliver_paper.abendzeitung_muenchen_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  ld_nodes <- rvest::html_nodes(page, "script[type = \"application/ld+json\"] ")
  json_df <- jsonlite::fromJSON(rvest::html_text(ld_nodes[1]))

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)

  body_nodes <- rvest::html_nodes(
    page,
    ".artdetail_short ,.artdetail_text p,.artdetail_text h2"
  )
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text,
    json_df # dumping the whole json data of an article
  )
}
# Parser for echo24.de articles.
#
# Metadata (date, headline, author names) is taken from the `mainEntity`
# element of the first JSON-LD (`application/ld+json`) script; the article
# text is collected from the html in `x$content_raw`.
#
# Arguments:
#   x            object holding the raw html in `x$content_raw`.
#   verbose, pb  handed to pb_tick() (helper defined elsewhere in the package).
#   ...          not used in this method.
#
# Returns the value of s_n_list() built from `datetime`, `author`,
# `headline`, `text` and `json_df` (the `mainEntity` element).
#' @export
pb_deliver_paper.echo24_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  page <- rvest::read_html(x$content_raw)

  ld_script <- rvest::html_nodes(page, "script[type = \"application/ld+json\"] ")[1]
  json_df <- jsonlite::fromJSON(rvest::html_text(ld_script))$mainEntity

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  author <- toString(json_df$author$name)

  story_text <- rvest::html_text2(
    rvest::html_nodes(
      page,
      ".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead"
    )
  )
  text <- paste(story_text, collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text,
    json_df # dumping the whole json data of an article
  )
}
"deutschlandfunkkultur.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.deutschlandfunkkultur.de/politik-114.rss" "badische-zeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.derwesten.de/feed" +"echo24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "edition.cnn.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","http://rss.cnn.com/rss/edition.rss" "eu.usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "evolvepolitics.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 13ae0ab83910b9e078bc654f7ed6392ad872a579 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 20:24:43 +0200 Subject: [PATCH 066/121] added mopo.de --- NAMESPACE | 1 + R/deliver_mopo_de.R | 33 +++++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 35 insertions(+) create mode 100644 R/deliver_mopo_de.R diff --git a/NAMESPACE b/NAMESPACE index 18c1b0d..19eabbd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -64,6 +64,7 @@ S3method(pb_deliver_paper,mdr_de) S3method(pb_deliver_paper,mediacourant_nl) S3method(pb_deliver_paper,merkur_de) S3method(pb_deliver_paper,metronieuws_nl) +S3method(pb_deliver_paper,mopo_de) S3method(pb_deliver_paper,morgenpost_de) S3method(pb_deliver_paper,n_tv_de) S3method(pb_deliver_paper,ndr_de) diff --git a/R/deliver_mopo_de.R b/R/deliver_mopo_de.R new file mode 100644 index 0000000..4fc5456 --- /dev/null +++ 
b/R/deliver_mopo_de.R @@ -0,0 +1,33 @@ +#' @export +pb_deliver_paper.mopo_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + json_df <- json_df$`@graph` + if (any(json_df$`@type` == "Person")) { + author <- toString(json_df$name[json_df$`@type` == "Person"]) + } else { + author <- "" + } + json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- sub(" \\| .*", "", json_df$headline) + text <- html %>% + rvest::html_nodes("p, h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + if (author == "") { + # the text has the author abbr. at the end + author <- sub(".*\\(([^)]+)\\)$", "\\1", text) + } + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index e414d45..38ce345 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -68,6 +68,7 @@ "mediacourant.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.mediacourant.nl/feed/" "merkur.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "http://www.merkur.de/rssfeed.rdf" "metronieuws.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.metronieuws.nl/feed/" +"mopo.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA 
"morgenpost.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "msnbc.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "ndr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "http://www.ndr.de/home/index-rss.xml" From 63a1231877f7519f377837d4a3741769c58e9a18 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 20:29:54 +0200 Subject: [PATCH 067/121] added saechsische.de --- NAMESPACE | 1 + R/deliver_saechsische_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_saechsische_de.R diff --git a/NAMESPACE b/NAMESPACE index 19eabbd..1b9c976 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -88,6 +88,7 @@ S3method(pb_deliver_paper,rte_ie) S3method(pb_deliver_paper,rtl_de) S3method(pb_deliver_paper,rtl_nl) S3method(pb_deliver_paper,ruhr24_de) +S3method(pb_deliver_paper,saechsische_de) S3method(pb_deliver_paper,seznamzpravy_cz) S3method(pb_deliver_paper,sfgate_com) S3method(pb_deliver_paper,skwawkbox_org) diff --git a/R/deliver_saechsische_de.R b/R/deliver_saechsische_de.R new file mode 100644 index 0000000..7ad3951 --- /dev/null +++ b/R/deliver_saechsische_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.saechsische_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[3] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 38ce345..14a8bca 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -94,6 +94,7 @@ "rtl.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.rtl.de/rss/feed/news" "rtl.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.rtlnieuws.nl/rss.xml" "ruhr24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA +"saechsische.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://www.saechsische.de/arc/outboundfeeds/rss/" "seznamzpravy.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.seznamzpravy.cz/rss" "sfgate.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA 
"skwawkbox.org","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 32e8c6130e72a5edd814826c5f114c6223e6897e Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 20:34:57 +0200 Subject: [PATCH 068/121] added kurier.at --- NAMESPACE | 1 + R/deliver_kurier_at.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_kurier_at.R diff --git a/NAMESPACE b/NAMESPACE index 1b9c976..539c7db 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -57,6 +57,7 @@ S3method(pb_deliver_paper,joe_ie) S3method(pb_deliver_paper,karlsruhe_insider_de) S3method(pb_deliver_paper,kreiszeitung_de) S3method(pb_deliver_paper,ksta_de) +S3method(pb_deliver_paper,kurier_at) S3method(pb_deliver_paper,latimes_com) S3method(pb_deliver_paper,lidovky_cz) S3method(pb_deliver_paper,marketwatch_com) diff --git a/R/deliver_kurier_at.R b/R/deliver_kurier_at.R new file mode 100644 index 0000000..c40452b --- /dev/null +++ b/R/deliver_kurier_at.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.kurier_at <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".headerComp-intro,.paragraph.copy") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 14a8bca..facdacc 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -60,6 +60,7 @@ "karlsruhe-insider.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "kreiszeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "ksta.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://feed.ksta.de/feed/rss/politik/index.rss" +"kurier.at","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://kurier.at/xml/rssd" "latimes.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.latimes.com/politics/rss2.0.xml" "lidovky.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://servis.lidovky.cz/rss.aspx" 
"lnk.techrepublic.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA From 059f2d514a763c7514ee091dd287e4e479157727 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 20:45:03 +0200 Subject: [PATCH 069/121] added manager-magazin.de --- NAMESPACE | 1 + R/deliver_manager_magazin_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_manager_magazin_de.R diff --git a/NAMESPACE b/NAMESPACE index 539c7db..b08e412 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -60,6 +60,7 @@ S3method(pb_deliver_paper,ksta_de) S3method(pb_deliver_paper,kurier_at) S3method(pb_deliver_paper,latimes_com) S3method(pb_deliver_paper,lidovky_cz) +S3method(pb_deliver_paper,manager_magazin_de) S3method(pb_deliver_paper,marketwatch_com) S3method(pb_deliver_paper,mdr_de) S3method(pb_deliver_paper,mediacourant_nl) diff --git a/R/deliver_manager_magazin_de.R b/R/deliver_manager_magazin_de.R new file mode 100644 index 0000000..137bf07 --- /dev/null +++ b/R/deliver_manager_magazin_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.manager_magazin_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + json_df <- json_df[json_df$`@type` == "NewsArticle", ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".leading-loose, .RichText p, .RichText h3") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index facdacc..e035ca4 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -64,6 +64,7 @@ "latimes.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.latimes.com/politics/rss2.0.xml" "lidovky.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://servis.lidovky.cz/rss.aspx" "lnk.techrepublic.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA +"manager-magazin.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://www.manager-magazin.de/news/index.rss" "marketwatch.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "mdr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA 
"mediacourant.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.mediacourant.nl/feed/" From b92fae00c41f050714934491a2b918b7b7517d12 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 20:53:23 +0200 Subject: [PATCH 070/121] added bnn.de --- NAMESPACE | 1 + R/deliver_bnn_de.R | 26 ++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 28 insertions(+) create mode 100644 R/deliver_bnn_de.R diff --git a/NAMESPACE b/NAMESPACE index b08e412..c751208 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -15,6 +15,7 @@ S3method(pb_deliver_paper,bbc_co_uk) S3method(pb_deliver_paper,berliner_zeitung_de) S3method(pb_deliver_paper,bild_de) S3method(pb_deliver_paper,blesk_cz) +S3method(pb_deliver_paper,bnn_de) S3method(pb_deliver_paper,br_de) S3method(pb_deliver_paper,breakingnews_ie) S3method(pb_deliver_paper,breitbart_com) diff --git a/R/deliver_bnn_de.R b/R/deliver_bnn_de.R new file mode 100644 index 0000000..7551201 --- /dev/null +++ b/R/deliver_bnn_de.R @@ -0,0 +1,26 @@ +#' @export +pb_deliver_paper.bnn_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + article <- grepl("\"NewsArticle\"", json_txt) + json_df <- jsonlite::fromJSON(json_txt[article]) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".intro,.article__body p,.article__body h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index e035ca4..46934c3 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -11,6 +11,7 @@ "berliner-zeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.berliner-zeitung.de/feed.xml" "bild.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.bild.de/rssfeeds/rss3-20745882,feed=alles.bild.html" "blesk.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.blesk.cz/rss" +"bnn.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "boston.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "bostonglobe.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA 
"br.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://nachrichtenfeeds.br.de/rss/nachrichten/seiten/QXAPwyN" From beffc2fdbe834956a05d3ffe6139616a0c259b1c Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 21:00:02 +0200 Subject: [PATCH 071/121] added nordkurier.de --- NAMESPACE | 1 + R/deliver_nordkurier_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_nordkurier_de.R diff --git a/NAMESPACE b/NAMESPACE index c751208..15dc8cf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -74,6 +74,7 @@ S3method(pb_deliver_paper,ndr_de) S3method(pb_deliver_paper,news_de) S3method(pb_deliver_paper,newstatesman_com) S3method(pb_deliver_paper,newsweek_com) +S3method(pb_deliver_paper,nordkurier_de) S3method(pb_deliver_paper,nos_nl) S3method(pb_deliver_paper,novinky_cz) S3method(pb_deliver_paper,nrc_nl) diff --git a/R/deliver_nordkurier_de.R b/R/deliver_nordkurier_de.R new file mode 100644 index 0000000..22b0a63 --- /dev/null +++ b/R/deliver_nordkurier_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.nordkurier_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".tw-text-title-md, .paragraph,h2.tw-mb-4") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 46934c3..c8ba66f 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -78,6 +78,7 @@ "news.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.news.de/rss/364367598/politik/" "newstatesman.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.newstatesman.com/feed/" "newsweek.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA +"nordkurier.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "nos.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.nos.nl/nosnieuwsalgemeen" "novinky.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.novinky.cz/rss" "nrc.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 660b7de207e8e4d850cd6c6bf3375c49c6c52f87 Mon Sep 17 00:00:00 
2001 From: schochastics <david@schochastics.net> Date: Thu, 17 Oct 2024 22:08:13 +0200 Subject: [PATCH 072/121] added rollingstone.de --- NAMESPACE | 1 + R/deliver_rollingstone_de.R | 30 ++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 32 insertions(+) create mode 100644 R/deliver_rollingstone_de.R diff --git a/NAMESPACE b/NAMESPACE index 15dc8cf..67ddf3c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -87,6 +87,7 @@ S3method(pb_deliver_paper,presseportal_de) S3method(pb_deliver_paper,prosieben_de) S3method(pb_deliver_paper,rbb24_de) S3method(pb_deliver_paper,rnd_de) +S3method(pb_deliver_paper,rollingstone_de) S3method(pb_deliver_paper,rp_online_de) S3method(pb_deliver_paper,rte_ie) S3method(pb_deliver_paper,rtl_de) diff --git a/R/deliver_rollingstone_de.R b/R/deliver_rollingstone_de.R new file mode 100644 index 0000000..c3d0415 --- /dev/null +++ b/R/deliver_rollingstone_de.R @@ -0,0 +1,30 @@ +#' @export +pb_deliver_paper.rollingstone_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + json_df <- json_df$`@graph` + if (any(json_df$`@type` == "Person")) { + author <- toString(json_df$name[json_df$`@type` == "Person"]) + } else { + author <- "" + } + json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + text <- html %>% + rvest::html_nodes(".asmb-article-excerpt,.asmb-article-content-container h2,.asmb-article-content-container p") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index c8ba66f..fb72773 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -93,6 +93,7 @@ "prosieben.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "rbb24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.rbb24.de/aktuell/index.xml/feed=rss.xml" "rnd.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.rnd.de/arc/outboundfeeds/rss/" +"rollingstone.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA 
"rp-online.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://rp-online.de/feed.rss" "rte.ie","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.rte.ie/feeds/rss/?index=/news/" "rtl.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.rtl.de/rss/feed/news" From 37f17395b25e4c746a1103ed517474779ba7f748 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 07:15:25 +0200 Subject: [PATCH 073/121] added berliner-kurier.de --- NAMESPACE | 1 + R/deliver_berliner_kurier_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_berliner_kurier_de.R diff --git a/NAMESPACE b/NAMESPACE index 67ddf3c..6848e72 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ S3method(pb_deliver_paper,anotherangryvoice_blogspot_com) S3method(pb_deliver_paper,augsburger_allgemeine_de) S3method(pb_deliver_paper,badische_zeitung_de) S3method(pb_deliver_paper,bbc_co_uk) +S3method(pb_deliver_paper,berliner_kurier_de) S3method(pb_deliver_paper,berliner_zeitung_de) S3method(pb_deliver_paper,bild_de) S3method(pb_deliver_paper,blesk_cz) diff --git a/R/deliver_berliner_kurier_de.R b/R/deliver_berliner_kurier_de.R new file mode 100644 index 0000000..d840e75 --- /dev/null +++ b/R/deliver_berliner_kurier_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.berliner_kurier_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".article_header-lead__0E3Bn, p.article_paragraph__hXYKJ, h2.article_subtitle__wx1Lu") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index fb72773..2ff65f9 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -8,6 +8,7 @@ "augsburger-allgemeine.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "http://www.augsburger-allgemeine.de/augsburg/rss" "badische-zeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "bbc.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.bbci.co.uk/news/rss.xml" +"berliner-kurier.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "berliner-zeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.berliner-zeitung.de/feed.xml" 
"bild.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.bild.de/rssfeeds/rss3-20745882,feed=alles.bild.html" "blesk.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.blesk.cz/rss" From 10542fdbe2c925a9b6dfcfe7a09e9e60deb5afd4 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 07:28:18 +0200 Subject: [PATCH 074/121] added vice.com --- NAMESPACE | 1 + R/deliver_vice_com.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_vice_com.R diff --git a/NAMESPACE b/NAMESPACE index 6848e72..f4fe2c3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -119,6 +119,7 @@ S3method(pb_deliver_paper,thejournal_ie) S3method(pb_deliver_paper,thesun_ie) S3method(pb_deliver_paper,tz_de) S3method(pb_deliver_paper,usatoday_com) +S3method(pb_deliver_paper,vice_com) S3method(pb_deliver_paper,volkskrant_nl) S3method(pb_deliver_paper,washingtonpost_com) S3method(pb_deliver_paper,watson_de) diff --git a/R/deliver_vice_com.R b/R/deliver_vice_com.R new file mode 100644 index 0000000..45d5123 --- /dev/null +++ b/R/deliver_vice_com.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.vice_com <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".entry-content.entry-content p,.entry-content.entry-content h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 2ff65f9..aa7fe37 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -131,6 +131,7 @@ "t-online.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.t-online.de/nachrichten/feed.rss" "us.cnn.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA +"vice.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "volkskrant.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.volkskrant.nl/rss.xml" "washingtonpost.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.washingtonpost.com/rss/world" 
"watson.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA From e856414f51ca3946fadd61e42f9b866da91a7bca Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 07:43:15 +0200 Subject: [PATCH 075/121] fixed wrong variable name for br.de --- R/deliver_br_de.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/deliver_br_de.R b/R/deliver_br_de.R index 53f0b05..6776d40 100644 --- a/R/deliver_br_de.R +++ b/R/deliver_br_de.R @@ -10,9 +10,9 @@ pb_deliver_paper.br_de <- function(x, verbose = NULL, pb, ...) { types <- sapply(json_df, function(x) x$`@type`) if (any(types == "NewsArticle")) { json_df <- json_df[types == "NewsArticle"][[1]] - } else if (any(type == "VideoObject")) { + } else if (any(types == "VideoObject")) { json_df <- json_df[types == "VideoObject"][[1]] - } else if (any(type == "AudioObject")) { + } else if (any(types == "AudioObject")) { json_df <- json_df[types == "AudioObject"][[1]] } } From eb53e7646d9416af0b62a404fcfa94b954c61c47 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 07:43:33 +0200 Subject: [PATCH 076/121] added ruhrnachrichten.de --- NAMESPACE | 1 + R/deliver_ruhrnachrichten_de.R | 31 +++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 33 insertions(+) create mode 100644 R/deliver_ruhrnachrichten_de.R diff --git a/NAMESPACE b/NAMESPACE index f4fe2c3..45c9b82 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -94,6 +94,7 @@ S3method(pb_deliver_paper,rte_ie) S3method(pb_deliver_paper,rtl_de) S3method(pb_deliver_paper,rtl_nl) S3method(pb_deliver_paper,ruhr24_de) +S3method(pb_deliver_paper,ruhrnachrichten_de) S3method(pb_deliver_paper,saechsische_de) S3method(pb_deliver_paper,seznamzpravy_cz) S3method(pb_deliver_paper,sfgate_com) diff --git a/R/deliver_ruhrnachrichten_de.R b/R/deliver_ruhrnachrichten_de.R new 
file mode 100644 index 0000000..355e1d9 --- /dev/null +++ b/R/deliver_ruhrnachrichten_de.R @@ -0,0 +1,31 @@ +#' @export +pb_deliver_paper.ruhrnachrichten_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + json_df <- json_df$`@graph` + if (any(json_df$`@type` == "Person")) { + author <- toString(json_df$name[json_df$`@type` == "Person"]) + } else { + author <- "" + } + json_df <- json_df[grepl("NewsArticle|Article", json_df$`@type`), ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + text <- html %>% + rvest::html_nodes("p.article__teaser-text,.article__content p, .article__content h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") %>% + gsub("\nZur Startseite$", "", .) + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index aa7fe37..0a186e4 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -100,6 +100,7 @@ "rtl.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.rtl.de/rss/feed/news" "rtl.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.rtlnieuws.nl/rss.xml" "ruhr24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA +"ruhrnachrichten.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA 
"saechsische.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://www.saechsische.de/arc/outboundfeeds/rss/" "seznamzpravy.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.seznamzpravy.cz/rss" "sfgate.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 7a7657700bed5759251a4ae6a089bccac6dcfae7 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 08:20:41 +0200 Subject: [PATCH 077/121] added vox.de --- NAMESPACE | 1 + R/deliver_vox_de.R | 30 ++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 32 insertions(+) create mode 100644 R/deliver_vox_de.R diff --git a/NAMESPACE b/NAMESPACE index 45c9b82..977e5ce 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -122,6 +122,7 @@ S3method(pb_deliver_paper,tz_de) S3method(pb_deliver_paper,usatoday_com) S3method(pb_deliver_paper,vice_com) S3method(pb_deliver_paper,volkskrant_nl) +S3method(pb_deliver_paper,vox_de) S3method(pb_deliver_paper,washingtonpost_com) S3method(pb_deliver_paper,watson_de) S3method(pb_deliver_paper,waz_de) diff --git a/R/deliver_vox_de.R b/R/deliver_vox_de.R new file mode 100644 index 0000000..f0cf8ea --- /dev/null +++ b/R/deliver_vox_de.R @@ -0,0 +1,30 @@ +#' @export +pb_deliver_paper.vox_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + if (nrow(json_df) > 1) { + json_df <- json_df[json_df$`@type` == "Article", ] + } + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- json_df$articleBody + if (author == "VOX Online") { + # the text might have the author abbr. at the end + author_abbr <- sub(".*\\(([^)]+)\\)$", "\\1", text) + if (author_abbr != "") { + author <- author_abbr + } + } + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 0a186e4..b3c22c2 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -134,6 +134,7 @@ "usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "vice.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "volkskrant.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.volkskrant.nl/rss.xml" +"vox.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "washingtonpost.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.washingtonpost.com/rss/world" "watson.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA 
"waz.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.waz.de/rss" From a9d1d81b38ceeafd30f0ef1e1bb0aa6bef51c826 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 08:41:48 +0200 Subject: [PATCH 078/121] added der-postillon.com --- NAMESPACE | 1 + R/deliver_der_postillon_com.R | 35 +++++++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 37 insertions(+) create mode 100644 R/deliver_der_postillon_com.R diff --git a/NAMESPACE b/NAMESPACE index 977e5ce..f982cce 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -29,6 +29,7 @@ S3method(pb_deliver_paper,cnn_com) S3method(pb_deliver_paper,dailymail_co_uk) S3method(pb_deliver_paper,default) S3method(pb_deliver_paper,denikn_cz) +S3method(pb_deliver_paper,der_postillon_com) S3method(pb_deliver_paper,derwesten_de) S3method(pb_deliver_paper,deutschlandfunk_de) S3method(pb_deliver_paper,deutschlandfunkkultur_de) diff --git a/R/deliver_der_postillon_com.R b/R/deliver_der_postillon_com.R new file mode 100644 index 0000000..96d211d --- /dev/null +++ b/R/deliver_der_postillon_com.R @@ -0,0 +1,35 @@ +#' @export +pb_deliver_paper.der_postillon_com <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".post-body p") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + # author abbr can be found at the end of the article + if (author == "Der Postillon") { + author_tmp <- html %>% + rvest::html_node("div[id='post-body'] span[style='font-size: x-small;']") %>% + rvest::html_text() %>% + sub("; Erstver.*$", "", .) + if (author_tmp != "") { + author <- author_tmp + } + } + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index b3c22c2..58357f1 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -27,6 +27,7 @@ "dailymail.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.dailymail.co.uk/news/index.rss" "decider.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "denikn.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://denikn.cz/rss" +"der-postillon.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "deutschlandfunk.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.deutschlandfunk.de/nachrichten-100.rss" 
"deutschlandfunkkultur.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.deutschlandfunkkultur.de/politik-114.rss" "badische-zeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.derwesten.de/feed" From 45586a2a702a27957fb597526bebe93f4e199283 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 08:45:33 +0200 Subject: [PATCH 079/121] added heidelberg24.de --- NAMESPACE | 1 + R/deliver_heidelberg24_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_heidelberg24_de.R diff --git a/NAMESPACE b/NAMESPACE index f982cce..05199a2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -44,6 +44,7 @@ S3method(pb_deliver_paper,foxbusiness_com) S3method(pb_deliver_paper,fr_de) S3method(pb_deliver_paper,geenstijl_nl) S3method(pb_deliver_paper,handelsblatt_com) +S3method(pb_deliver_paper,heidelberg24_de) S3method(pb_deliver_paper,heise_de) S3method(pb_deliver_paper,hn_cz) S3method(pb_deliver_paper,hna_de) diff --git a/R/deliver_heidelberg24_de.R b/R/deliver_heidelberg24_de.R new file mode 100644 index 0000000..8b96a1d --- /dev/null +++ b/R/deliver_heidelberg24_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.heidelberg24_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + json_df <- json_df$mainEntity + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 58357f1..fad5ad5 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -47,6 +47,7 @@ "ftw.usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "geenstijl.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.geenstijl.nl/feeds/recent.atom" "handelsblatt.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.handelsblatt.com/contentexport/feed/schlagzeilen" +"heidelberg24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "heise.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://www.heise.de/rss/heise.rdf" "hn.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://domaci.hn.cz/?m=rss" 
"hna.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.hna.de/politik/rssfeed.xml" From 8a98991b3c7853866a694eef54bead4e1f4d25a2 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 09:08:28 +0200 Subject: [PATCH 080/121] added news-und-nachrichten.de --- NAMESPACE | 1 + R/deliver_news_und_nachrichten_de.R | 22 ++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 24 insertions(+) create mode 100644 R/deliver_news_und_nachrichten_de.R diff --git a/NAMESPACE b/NAMESPACE index 05199a2..ef96bab 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -75,6 +75,7 @@ S3method(pb_deliver_paper,morgenpost_de) S3method(pb_deliver_paper,n_tv_de) S3method(pb_deliver_paper,ndr_de) S3method(pb_deliver_paper,news_de) +S3method(pb_deliver_paper,news_und_nachrichten_de) S3method(pb_deliver_paper,newstatesman_com) S3method(pb_deliver_paper,newsweek_com) S3method(pb_deliver_paper,nordkurier_de) diff --git a/R/deliver_news_und_nachrichten_de.R b/R/deliver_news_und_nachrichten_de.R new file mode 100644 index 0000000..a168e0d --- /dev/null +++ b/R/deliver_news_und_nachrichten_de.R @@ -0,0 +1,22 @@ +#' @export +pb_deliver_paper.news_und_nachrichten_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(gsub("[\r\n]*", "", json_txt)) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author) + text <- json_df$articleBody + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index fad5ad5..dd8d9f7 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -81,6 +81,7 @@ "news.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.news.de/rss/364367598/politik/" "newstatesman.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.newstatesman.com/feed/" "newsweek.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA +"news-und-nachrichten.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "nordkurier.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "nos.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.nos.nl/nosnieuwsalgemeen" "novinky.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.novinky.cz/rss" From 62cd6e4221893021b43db39012cffaf1a4418e00 Mon Sep 17 00:00:00 2001 From: 
schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 09:15:29 +0200 Subject: [PATCH 081/121] added volksstimme.de --- NAMESPACE | 1 + R/deliver_volksstimme_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_volksstimme_de.R diff --git a/NAMESPACE b/NAMESPACE index ef96bab..b9327c9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -125,6 +125,7 @@ S3method(pb_deliver_paper,tz_de) S3method(pb_deliver_paper,usatoday_com) S3method(pb_deliver_paper,vice_com) S3method(pb_deliver_paper,volkskrant_nl) +S3method(pb_deliver_paper,volksstimme_de) S3method(pb_deliver_paper,vox_de) S3method(pb_deliver_paper,washingtonpost_com) S3method(pb_deliver_paper,watson_de) diff --git a/R/deliver_volksstimme_de.R b/R/deliver_volksstimme_de.R new file mode 100644 index 0000000..735f8a2 --- /dev/null +++ b/R/deliver_volksstimme_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.volksstimme_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".fp-article-heading__excerpt,.fp-paragraph, .fp-subheading") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index dd8d9f7..ed248c6 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -137,6 +137,7 @@ "usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA 
"vice.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "volkskrant.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.volkskrant.nl/rss.xml" +"volksstimme.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "vox.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "washingtonpost.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.washingtonpost.com/rss/world" "watson.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA From ee67203180c363b5d5a3e8c82dfc94f7f77125ec Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 09:49:13 +0200 Subject: [PATCH 082/121] added 3sat.de --- NAMESPACE | 1 + R/deliver_3sat_de.R | 28 ++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 30 insertions(+) create mode 100644 R/deliver_3sat_de.R diff --git a/NAMESPACE b/NAMESPACE index b9327c9..33fb747 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ S3method(pb_deliver,character) S3method(pb_deliver,data.frame) S3method(pb_deliver,default) +S3method(pb_deliver_paper,"3sat_de") S3method(pb_deliver_paper,abendblatt_de) S3method(pb_deliver_paper,abendzeitung_muenchen_de) S3method(pb_deliver_paper,ac24_cz) diff --git a/R/deliver_3sat_de.R b/R/deliver_3sat_de.R new file mode 100644 index 0000000..8362825 --- /dev/null +++ b/R/deliver_3sat_de.R @@ -0,0 +1,28 @@ +#' @export 
+pb_deliver_paper.3sat_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + datetime <- html %>% + rvest::html_nodes("time") %>% + rvest::html_attr("datetime") %>% + lubridate::as_datetime() + + headline <- html %>% + rvest::html_nodes(".main-content-details h2") %>% + rvest::html_text() + + author <- "" # no author info found + + text <- html %>% + rvest::html_nodes(".o--post-long p") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text + ) +} diff --git a/inst/status.csv b/inst/status.csv index ed248c6..aa4c5b7 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -1,4 +1,5 @@ "domain","status","author","issues","rss" +"3sat.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://www.3sat.de/rss/zdf/gesellschaft" "abendblatt.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://www.abendblatt.de/rss" "abendzeitung-muenchen.de.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://www.abendzeitung-muenchen.de/storage/rss/rss/alle-artikel-abendzeitung.xml" "ac24.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.ac24.cz/feed/" From e55b30c557a7579276ca3b945fb13887a92371ae Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 10:05:09 +0200 Subject: [PATCH 083/121] added derstandard.at --- NAMESPACE | 1 + R/derstandard_at.R | 31 +++++++++++++++++++++++++++++++ inst/status.csv | 3 ++- 3 files changed, 34 insertions(+), 
1 deletion(-) create mode 100644 R/derstandard_at.R diff --git a/NAMESPACE b/NAMESPACE index 33fb747..dae7b37 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -31,6 +31,7 @@ S3method(pb_deliver_paper,dailymail_co_uk) S3method(pb_deliver_paper,default) S3method(pb_deliver_paper,denikn_cz) S3method(pb_deliver_paper,der_postillon_com) +S3method(pb_deliver_paper,derstandard_at) S3method(pb_deliver_paper,derwesten_de) S3method(pb_deliver_paper,deutschlandfunk_de) S3method(pb_deliver_paper,deutschlandfunkkultur_de) diff --git a/R/derstandard_at.R b/R/derstandard_at.R new file mode 100644 index 0000000..d23094c --- /dev/null +++ b/R/derstandard_at.R @@ -0,0 +1,31 @@ +#' @export +pb_deliver_paper.derstandard_at <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + datetime <- html %>% + rvest::html_nodes(".article-meta") %>% + rvest::html_text() %>% + lubridate::as_datetime() + + headline <- html %>% + rvest::html_nodes("h1.article-title") %>% + rvest::html_text() + + author <- html %>% + rvest::html_nodes(".article-origins") %>% + rvest::html_text() %>% + toString() + + text <- html %>% + rvest::html_nodes(".article-body p, .article-body h3") %>% + rvest::html_text2() %>% + paste(collapse = "\n") # There is a note that parts of the website are blocked + + s_n_list( + datetime, + author, + headline, + text + ) +} diff --git a/inst/status.csv b/inst/status.csv index aa4c5b7..6a9592a 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -28,10 +28,11 @@ "dailymail.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.dailymail.co.uk/news/index.rss" "decider.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA 
"denikn.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://denikn.cz/rss" +"derstandard.at","![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://www.derstandard.at/rss" +"derwesten.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.derwesten.de/feed" "der-postillon.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "deutschlandfunk.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.deutschlandfunk.de/nachrichten-100.rss" "deutschlandfunkkultur.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.deutschlandfunkkultur.de/politik-114.rss" -"badische-zeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.derwesten.de/feed" "echo24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "edition.cnn.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","http://rss.cnn.com/rss/edition.rss" "eu.usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 
677e7342324ec9ba0cfa5263df339b91a2e9973c Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 10:35:32 +0200 Subject: [PATCH 084/121] added lvz.de --- NAMESPACE | 1 + R/deliver_lvz_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_lvz_de.R diff --git a/NAMESPACE b/NAMESPACE index dae7b37..009adeb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -66,6 +66,7 @@ S3method(pb_deliver_paper,ksta_de) S3method(pb_deliver_paper,kurier_at) S3method(pb_deliver_paper,latimes_com) S3method(pb_deliver_paper,lidovky_cz) +S3method(pb_deliver_paper,lvz_de) S3method(pb_deliver_paper,manager_magazin_de) S3method(pb_deliver_paper,marketwatch_com) S3method(pb_deliver_paper,mdr_de) diff --git a/R/deliver_lvz_de.R b/R/deliver_lvz_de.R new file mode 100644 index 0000000..136dc78 --- /dev/null +++ b/R/deliver_lvz_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.lvz_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[3] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Headlinestyled__Headline-sc-mamptc-0,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Textstyled__Text-sc-1cqv9mi-0") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 6a9592a..7646487 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -70,6 +70,7 @@ 
"latimes.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.latimes.com/politics/rss2.0.xml" "lidovky.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://servis.lidovky.cz/rss.aspx" "lnk.techrepublic.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA +"lvz.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.lvz.de/arc/outboundfeeds/rss/" "manager-magazin.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://www.manager-magazin.de/news/index.rss" "marketwatch.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "mdr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA From 18a0a5ccdb0225bd408d9f140d083c9ce75b54be Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 10:43:41 +0200 Subject: [PATCH 085/121] added swrfernsehen.de --- NAMESPACE | 1 + R/deliver_swrfernsehen_de.R | 31 +++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 33 insertions(+) create mode 100644 R/deliver_swrfernsehen_de.R diff --git a/NAMESPACE b/NAMESPACE index 009adeb..c19ab35 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -113,6 +113,7 @@ S3method(pb_deliver_paper,sueddeutsche_de) S3method(pb_deliver_paper,suedkurier_de) S3method(pb_deliver_paper,swp_de) S3method(pb_deliver_paper,swr_de) +S3method(pb_deliver_paper,swrfernsehen_de) S3method(pb_deliver_paper,t3n_de) 
S3method(pb_deliver_paper,t_online_de) S3method(pb_deliver_paper,tag24_de) diff --git a/R/deliver_swrfernsehen_de.R b/R/deliver_swrfernsehen_de.R new file mode 100644 index 0000000..8506be7 --- /dev/null +++ b/R/deliver_swrfernsehen_de.R @@ -0,0 +1,31 @@ +#' @export +pb_deliver_paper.swrfernsehen_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + datetime <- html %>% + rvest::html_nodes(".meta-top .meta-description time") %>% + rvest::html_attr("datetime") %>% + lubridate::as_datetime() + + headline <- html %>% + rvest::html_nodes("h1.headline") %>% + rvest::html_text() + + author <- html %>% + rvest::html_nodes(".meta-top .meta-author-name a") %>% + rvest::html_text2() %>% + toString() + + text <- html %>% + rvest::html_nodes(".detail-body .lead,.bodytext p,.bodytext h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text + ) +} diff --git a/inst/status.csv b/inst/status.csv index 7646487..2d6972c 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -119,6 +119,7 @@ "suedkurier.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "swp.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "swr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA +"swrfernsehen.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA 
"tag24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "tagesschau.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "tagesspiegel.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.tagesspiegel.de/contentexport/feed/home" From e1da8822028938dfed7f8baec31c6df950c9e59f Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 13:50:21 +0200 Subject: [PATCH 086/121] added shz.de --- NAMESPACE | 1 + R/deliver_shz_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_shz_de.R diff --git a/NAMESPACE b/NAMESPACE index c19ab35..a25f7aa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -104,6 +104,7 @@ S3method(pb_deliver_paper,ruhrnachrichten_de) S3method(pb_deliver_paper,saechsische_de) S3method(pb_deliver_paper,seznamzpravy_cz) S3method(pb_deliver_paper,sfgate_com) +S3method(pb_deliver_paper,shz_de) S3method(pb_deliver_paper,skwawkbox_org) S3method(pb_deliver_paper,sky_com) S3method(pb_deliver_paper,spiegel_de) diff --git a/R/deliver_shz_de.R b/R/deliver_shz_de.R new file mode 100644 index 0000000..8706bae --- /dev/null +++ b/R/deliver_shz_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.shz_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("p.w-600, p,h2.h4") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 2d6972c..1108032 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -110,6 +110,7 @@ "saechsische.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://www.saechsische.de/arc/outboundfeeds/rss/" "seznamzpravy.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.seznamzpravy.cz/rss" "sfgate.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA +"shz.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "skwawkbox.org","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "sky.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.skynews.com/feeds/rss/home.xml" 
"spiegel.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.spiegel.de/schlagzeilen/index.rss" From 0d1b2f99932eab52ab74bf73589e0e5db0edc274 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 13:55:44 +0200 Subject: [PATCH 087/121] added fnp.de --- NAMESPACE | 1 + ...standard_at.R => deliver_derstandard_at.R} | 0 R/deliver_fnp_de.r | 25 +++++++++++++++++++ inst/status.csv | 1 + 4 files changed, 27 insertions(+) rename R/{derstandard_at.R => deliver_derstandard_at.R} (100%) create mode 100644 R/deliver_fnp_de.r diff --git a/NAMESPACE b/NAMESPACE index a25f7aa..97a3eba 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -40,6 +40,7 @@ S3method(pb_deliver_paper,evolvepolitics_com) S3method(pb_deliver_paper,express_de) S3method(pb_deliver_paper,faz_net) S3method(pb_deliver_paper,finanzen_net) +S3method(pb_deliver_paper,fnp_de) S3method(pb_deliver_paper,focus_de) S3method(pb_deliver_paper,forbes_com) S3method(pb_deliver_paper,foxbusiness_com) diff --git a/R/derstandard_at.R b/R/deliver_derstandard_at.R similarity index 100% rename from R/derstandard_at.R rename to R/deliver_derstandard_at.R diff --git a/R/deliver_fnp_de.r b/R/deliver_fnp_de.r new file mode 100644 index 0000000..4f2a587 --- /dev/null +++ b/R/deliver_fnp_de.r @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.fnp_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + json_df <- json_df$mainEntity + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 1108032..952df7a 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -40,6 +40,7 @@ "express.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "faz.net","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.faz.net/rss/aktuell/" "finanzen.net","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.finanzen.net/rss/news" +"fnp.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "focus.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://rss.focus.de/fol/XML/rss_folnews.xml" 
"forbes.com","![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)","[@JBGruber](https://github.com/JBGruber/)","[#2](https://github.com/JBGruber/paperboy/issues/2)",NA "fortune.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA From 0abfc16abbc1fde98fb26f6fce02eac1681d0664 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 14:01:54 +0200 Subject: [PATCH 088/121] added freiepresse.de --- NAMESPACE | 1 + R/deliver_freiepresse_de.R | 26 ++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 28 insertions(+) create mode 100644 R/deliver_freiepresse_de.R diff --git a/NAMESPACE b/NAMESPACE index 97a3eba..cf8efbb 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -45,6 +45,7 @@ S3method(pb_deliver_paper,focus_de) S3method(pb_deliver_paper,forbes_com) S3method(pb_deliver_paper,foxbusiness_com) S3method(pb_deliver_paper,fr_de) +S3method(pb_deliver_paper,freiepresse_de) S3method(pb_deliver_paper,geenstijl_nl) S3method(pb_deliver_paper,handelsblatt_com) S3method(pb_deliver_paper,heidelberg24_de) diff --git a/R/deliver_freiepresse_de.R b/R/deliver_freiepresse_de.R new file mode 100644 index 0000000..af3db23 --- /dev/null +++ b/R/deliver_freiepresse_de.R @@ -0,0 +1,26 @@ +#' @export +pb_deliver_paper.freiepresse_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + json_txt <- json_txt[grepl("NewsArticle", json_txt)] + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author) + text <- html %>% + rvest::html_nodes(".article__text p,.article__text h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 952df7a..6dc4295 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -47,6 +47,7 @@ "foxbusiness.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "foxnews.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "fr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.fr.de/rssfeed.rdf" +"freiepresse.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "ftw.usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "geenstijl.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.geenstijl.nl/feeds/recent.atom" 
"handelsblatt.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.handelsblatt.com/contentexport/feed/schlagzeilen" From 47e6dd241364e26c184d2b119b0d0167b011ca22 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 17:31:18 +0200 Subject: [PATCH 089/121] added wa.de --- NAMESPACE | 1 + R/deliver_wa_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_wa_de.R diff --git a/NAMESPACE b/NAMESPACE index cf8efbb..f77c5ec 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -134,6 +134,7 @@ S3method(pb_deliver_paper,vice_com) S3method(pb_deliver_paper,volkskrant_nl) S3method(pb_deliver_paper,volksstimme_de) S3method(pb_deliver_paper,vox_de) +S3method(pb_deliver_paper,wa_de) S3method(pb_deliver_paper,washingtonpost_com) S3method(pb_deliver_paper,watson_de) S3method(pb_deliver_paper,waz_de) diff --git a/R/deliver_wa_de.R b/R/deliver_wa_de.R new file mode 100644 index 0000000..f0bd62f --- /dev/null +++ b/R/deliver_wa_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.wa_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + json_df <- json_df$mainEntity + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 6dc4295..5e6d3f8 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -148,6 +148,7 @@ "vox.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "washingtonpost.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.washingtonpost.com/rss/world" "watson.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA +"wa.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "waz.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.waz.de/rss" 
"wdr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www1.wdr.de/wissen/uebersicht-nachrichten-100.feed" "welt.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.welt.de/feeds/latest.rss" From d409ec1286605cdafb8a64831912ef65ec6bdf12 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 18:28:28 +0200 Subject: [PATCH 090/121] added haz_de --- NAMESPACE | 1 + 1 file changed, 1 insertion(+) diff --git a/NAMESPACE b/NAMESPACE index f77c5ec..c0959c6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -48,6 +48,7 @@ S3method(pb_deliver_paper,fr_de) S3method(pb_deliver_paper,freiepresse_de) S3method(pb_deliver_paper,geenstijl_nl) S3method(pb_deliver_paper,handelsblatt_com) +S3method(pb_deliver_paper,haz_de) S3method(pb_deliver_paper,heidelberg24_de) S3method(pb_deliver_paper,heise_de) S3method(pb_deliver_paper,hn_cz) From 6629448d684352b2e045da79fb17894fc060abc9 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 18:29:05 +0200 Subject: [PATCH 091/121] added haz.de remaining --- R/deliver_haz_de.R | 26 ++++++++++++++++++++++++++ inst/status.csv | 1 + 2 files changed, 27 insertions(+) create mode 100644 R/deliver_haz_de.R diff --git a/R/deliver_haz_de.R b/R/deliver_haz_de.R new file mode 100644 index 0000000..f7cc1ba --- /dev/null +++ b/R/deliver_haz_de.R @@ -0,0 +1,26 @@ +#' @export +pb_deliver_paper.haz_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + json_txt <- json_txt[grepl("NewsArticle", json_txt)] + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2,.Textstyled__Text-sc-1cqv9mi-0.gqSIEH") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 5e6d3f8..5cb5974 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -51,6 +51,7 @@ "ftw.usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "geenstijl.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.geenstijl.nl/feeds/recent.atom" "handelsblatt.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.handelsblatt.com/contentexport/feed/schlagzeilen" +"haz.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "heidelberg24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "heise.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", 
"https://www.heise.de/rss/heise.rdf" "hn.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://domaci.hn.cz/?m=rss" From 1304b8fa9036a1444cb4e3226cb5cbb1e280c34e Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 18:37:52 +0200 Subject: [PATCH 092/121] added nw.de --- NAMESPACE | 1 + R/deliver_nw_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_nw_de.R diff --git a/NAMESPACE b/NAMESPACE index c0959c6..7558f02 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -89,6 +89,7 @@ S3method(pb_deliver_paper,nos_nl) S3method(pb_deliver_paper,novinky_cz) S3method(pb_deliver_paper,nrc_nl) S3method(pb_deliver_paper,nu_nl) +S3method(pb_deliver_paper,nw_de) S3method(pb_deliver_paper,nypost_com) S3method(pb_deliver_paper,nytimes_com) S3method(pb_deliver_paper,nzz_ch) diff --git a/R/deliver_nw_de.R b/R/deliver_nw_de.R new file mode 100644 index 0000000..999f24f --- /dev/null +++ b/R/deliver_nw_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.nw_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("p.em_text,h2.Zwischenzeile") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 5cb5974..904c1ce 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -92,6 +92,7 @@ "nos.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.nos.nl/nosnieuwsalgemeen" "novinky.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.novinky.cz/rss" "nrc.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA +"nw.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "n-tv.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.n-tv.de/rss" "nu.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.nu.nl/rss" "nypost.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 0bee0d26e984eca9099f0422ccfed6731dff1d10 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 
2024 18:43:47 +0200 Subject: [PATCH 093/121] added noz.de --- NAMESPACE | 1 + R/deliver_noz_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_noz_de.R diff --git a/NAMESPACE b/NAMESPACE index 7558f02..7e3fbe2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -87,6 +87,7 @@ S3method(pb_deliver_paper,newsweek_com) S3method(pb_deliver_paper,nordkurier_de) S3method(pb_deliver_paper,nos_nl) S3method(pb_deliver_paper,novinky_cz) +S3method(pb_deliver_paper,noz_de) S3method(pb_deliver_paper,nrc_nl) S3method(pb_deliver_paper,nu_nl) S3method(pb_deliver_paper,nw_de) diff --git a/R/deliver_noz_de.R b/R/deliver_noz_de.R new file mode 100644 index 0000000..45446d7 --- /dev/null +++ b/R/deliver_noz_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.noz_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("p.w-600,section.content--group p, section.content--group h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 904c1ce..325fb39 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -91,6 +91,7 @@ "nordkurier.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA 
"nos.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.nos.nl/nosnieuwsalgemeen" "novinky.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.novinky.cz/rss" +"noz.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "nrc.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "nw.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "n-tv.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.n-tv.de/rss" From 9a383eb6f4ad49227688b3bb62e6c060b12e9a0a Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 18:49:01 +0200 Subject: [PATCH 094/121] added orf.at --- NAMESPACE | 1 + R/deliver_orf_at.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_orf_at.R diff --git a/NAMESPACE b/NAMESPACE index 7e3fbe2..16ac40f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -94,6 +94,7 @@ S3method(pb_deliver_paper,nw_de) S3method(pb_deliver_paper,nypost_com) S3method(pb_deliver_paper,nytimes_com) S3method(pb_deliver_paper,nzz_ch) +S3method(pb_deliver_paper,orf_at) S3method(pb_deliver_paper,parlamentnilisty_cz) S3method(pb_deliver_paper,presseportal_de) S3method(pb_deliver_paper,prosieben_de) diff --git a/R/deliver_orf_at.R b/R/deliver_orf_at.R new file mode 100644 index 0000000..d1ce977 --- /dev/null +++ b/R/deliver_orf_at.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.orf_at <- function(x, verbose = NULL, 
pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".story-lead-text,.story-story p,.story-story h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 325fb39..289c742 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -99,6 +99,7 @@ "nypost.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "nytimes.com","![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)","[@JBGruber](https://github.com/JBGruber/)","[#17](https://github.com/JBGruber/paperboy/issues/17)","https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml" "nzz.ch","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.nzz.ch/recent.rss" +"orf.at","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://rss.orf.at/news.xml" "pagesix.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "parlamentnilisty.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","http://www.parlamentnilisty.cz/export/rss.aspx" 
"presseportal.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.presseportal.de/rss/presseportal.rss2" From 633fd2ccb1dd81428c009183c2158a9f197c4540 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 19:13:48 +0200 Subject: [PATCH 095/121] added srf.ch --- NAMESPACE | 1 + R/deliver_srf_ch.R | 31 +++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 33 insertions(+) create mode 100644 R/deliver_srf_ch.R diff --git a/NAMESPACE b/NAMESPACE index 16ac40f..92f52ca 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -114,6 +114,7 @@ S3method(pb_deliver_paper,shz_de) S3method(pb_deliver_paper,skwawkbox_org) S3method(pb_deliver_paper,sky_com) S3method(pb_deliver_paper,spiegel_de) +S3method(pb_deliver_paper,srf_ch) S3method(pb_deliver_paper,stern_de) S3method(pb_deliver_paper,stuttgarter_zeitung_de) S3method(pb_deliver_paper,sueddeutsche_de) diff --git a/R/deliver_srf_ch.R b/R/deliver_srf_ch.R new file mode 100644 index 0000000..8ef8d3e --- /dev/null +++ b/R/deliver_srf_ch.R @@ -0,0 +1,31 @@ +#' @export +pb_deliver_paper.srf_ch <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_df <- html %>% + rvest::html_node("span#config__js") %>% + rvest::html_attr("data-analytics-webtrekk-survey-gizmo-value-object") %>% + jsonlite::fromJSON() + + datetime <- lubridate::as_datetime(json_df$params$content_publication_datetime) + + headline <- html %>% + rvest::html_nodes("h1 .article-title__text") %>% + rvest::html_text() + + author <- "" # no article with author info founds + + text <- html %>% + rvest::html_nodes(".article-content p, .article-content h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text + ) +} diff --git a/inst/status.csv b/inst/status.csv index 289c742..3ffc7c5 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -120,6 +120,7 @@ "skwawkbox.org","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "sky.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.skynews.com/feeds/rss/home.xml" "spiegel.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.spiegel.de/schlagzeilen/index.rss" +"srf.ch","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.srf.ch/news/bnf/rss/1922" "stern.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.stern.de/feed/standard/all/" 
"stuttgarter-zeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "sueddeutsche.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://rss.sueddeutsche.de/alles" From f2480b2bfb4ac800022dc945a64c368d30c6856c Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 19:18:02 +0200 Subject: [PATCH 096/121] added epochtimes.de --- NAMESPACE | 1 + R/deliver_epochtimes_de.R | 22 ++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 24 insertions(+) create mode 100644 R/deliver_epochtimes_de.R diff --git a/NAMESPACE b/NAMESPACE index 92f52ca..ad3ce07 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -36,6 +36,7 @@ S3method(pb_deliver_paper,derwesten_de) S3method(pb_deliver_paper,deutschlandfunk_de) S3method(pb_deliver_paper,deutschlandfunkkultur_de) S3method(pb_deliver_paper,echo24_de) +S3method(pb_deliver_paper,epochtimes_de) S3method(pb_deliver_paper,evolvepolitics_com) S3method(pb_deliver_paper,express_de) S3method(pb_deliver_paper,faz_net) diff --git a/R/deliver_epochtimes_de.R b/R/deliver_epochtimes_de.R new file mode 100644 index 0000000..1a91fc0 --- /dev/null +++ b/R/deliver_epochtimes_de.R @@ -0,0 +1,22 @@ +#' @export +pb_deliver_paper.epochtimes_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- json_df$articleBody + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 3ffc7c5..96d74d9 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -35,6 +35,7 @@ "deutschlandfunkkultur.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.deutschlandfunkkultur.de/politik-114.rss" "echo24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "edition.cnn.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","http://rss.cnn.com/rss/edition.rss" +"epochtimes.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "eu.usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "evolvepolitics.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "express.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA From d40724a098ded030fc303851059025a3c3672115 Mon Sep 17 00:00:00 2001 
From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 19:25:00 +0200 Subject: [PATCH 097/121] added ostsee-zeitung.de --- NAMESPACE | 1 + R/deliver_ostsee_zeitung_de.R | 26 ++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 28 insertions(+) create mode 100644 R/deliver_ostsee_zeitung_de.R diff --git a/NAMESPACE b/NAMESPACE index ad3ce07..9a1dc7e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -96,6 +96,7 @@ S3method(pb_deliver_paper,nypost_com) S3method(pb_deliver_paper,nytimes_com) S3method(pb_deliver_paper,nzz_ch) S3method(pb_deliver_paper,orf_at) +S3method(pb_deliver_paper,ostsee_zeitung_de) S3method(pb_deliver_paper,parlamentnilisty_cz) S3method(pb_deliver_paper,presseportal_de) S3method(pb_deliver_paper,prosieben_de) diff --git a/R/deliver_ostsee_zeitung_de.R b/R/deliver_ostsee_zeitung_de.R new file mode 100644 index 0000000..07b557a --- /dev/null +++ b/R/deliver_ostsee_zeitung_de.R @@ -0,0 +1,26 @@ +#' @export +pb_deliver_paper.ostsee_zeitung_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + json_txt <- json_txt[grepl("NewsArticle", json_txt)] + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Textstyled__Text-sc-1cqv9mi-0,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Headlinestyled__Headline-sc-mamptc-0") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 96d74d9..9adc330 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -101,6 +101,7 @@ "nytimes.com","![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)","[@JBGruber](https://github.com/JBGruber/)","[#17](https://github.com/JBGruber/paperboy/issues/17)","https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml" "nzz.ch","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.nzz.ch/recent.rss" "orf.at","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://rss.orf.at/news.xml" +"ostsee-zeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "pagesix.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA 
"parlamentnilisty.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","http://www.parlamentnilisty.cz/export/rss.aspx" "presseportal.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.presseportal.de/rss/presseportal.rss2" From 5525f52c1c26301ebaa0f6cdf276d60c8a41a9e5 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 19:34:03 +0200 Subject: [PATCH 098/121] added swr3.de --- NAMESPACE | 1 + R/deliver_swr3_de.R | 31 +++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 33 insertions(+) create mode 100644 R/deliver_swr3_de.R diff --git a/NAMESPACE b/NAMESPACE index 9a1dc7e..4c0aa73 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -122,6 +122,7 @@ S3method(pb_deliver_paper,stuttgarter_zeitung_de) S3method(pb_deliver_paper,sueddeutsche_de) S3method(pb_deliver_paper,suedkurier_de) S3method(pb_deliver_paper,swp_de) +S3method(pb_deliver_paper,swr3_de) S3method(pb_deliver_paper,swr_de) S3method(pb_deliver_paper,swrfernsehen_de) S3method(pb_deliver_paper,t3n_de) diff --git a/R/deliver_swr3_de.R b/R/deliver_swr3_de.R new file mode 100644 index 0000000..85d400a --- /dev/null +++ b/R/deliver_swr3_de.R @@ -0,0 +1,31 @@ +#' @export +pb_deliver_paper.swr3_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + datetime <- html %>% + rvest::html_nodes(".meta-top time") %>% + rvest::html_attr("datetime") %>% + lubridate::as_datetime() + + headline <- html %>% + rvest::html_nodes("h1.headline") %>% + rvest::html_text() + + author <- html %>% + rvest::html_nodes(".meta-top .meta-author-name a") %>% + rvest::html_text2() %>% + toString() + + text <- html %>% + rvest::html_nodes("p.lead, .bodytext p, .bodytext h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text + ) +} diff --git a/inst/status.csv b/inst/status.csv index 9adc330..3a238fe 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -129,6 +129,7 @@ "suedkurier.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "swp.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "swr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA +"swr3.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "swrfernsehen.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "tag24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA 
"tagesschau.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA From e5d4ada66ee440eeac4407d8feba23b1ee251a84 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 19:38:37 +0200 Subject: [PATCH 099/121] added newsflash24.de --- NAMESPACE | 1 + R/deliver_newsflash24_de.R | 30 ++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 32 insertions(+) create mode 100644 R/deliver_newsflash24_de.R diff --git a/NAMESPACE b/NAMESPACE index 4c0aa73..a0c37cd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -83,6 +83,7 @@ S3method(pb_deliver_paper,n_tv_de) S3method(pb_deliver_paper,ndr_de) S3method(pb_deliver_paper,news_de) S3method(pb_deliver_paper,news_und_nachrichten_de) +S3method(pb_deliver_paper,newsflash24_de) S3method(pb_deliver_paper,newstatesman_com) S3method(pb_deliver_paper,newsweek_com) S3method(pb_deliver_paper,nordkurier_de) diff --git a/R/deliver_newsflash24_de.R b/R/deliver_newsflash24_de.R new file mode 100644 index 0000000..66a56a4 --- /dev/null +++ b/R/deliver_newsflash24_de.R @@ -0,0 +1,30 @@ +#' @export +pb_deliver_paper.newsflash24_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + json_df <- json_df$`@graph` + if (any(json_df$`@type` == "Person")) { + author <- toString(json_df$name[json_df$`@type` == "Person"]) + } else { + author <- "" + } + json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + text <- html %>% + rvest::html_nodes(".entry-content p, .entry-content h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 3a238fe..414f567 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -86,6 +86,7 @@ "msnbc.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "ndr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "http://www.ndr.de/home/index-rss.xml" "news.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.news.de/rss/364367598/politik/" +"newsflash24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "newstatesman.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.newstatesman.com/feed/" 
"newsweek.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "news-und-nachrichten.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA From 20ed71a7f46204d80389b0ede4f1311b2810c978 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 19:43:03 +0200 Subject: [PATCH 100/121] added jungefreiheit.de --- NAMESPACE | 1 + R/deliver_jungefreiheit_de.R | 30 ++++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 32 insertions(+) create mode 100644 R/deliver_jungefreiheit_de.R diff --git a/NAMESPACE b/NAMESPACE index a0c37cd..35c71c6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -64,6 +64,7 @@ S3method(pb_deliver_paper,irishmirror_ie) S3method(pb_deliver_paper,irishtimes_com) S3method(pb_deliver_paper,irozhlas_cz) S3method(pb_deliver_paper,joe_ie) +S3method(pb_deliver_paper,jungefreiheit_de) S3method(pb_deliver_paper,karlsruhe_insider_de) S3method(pb_deliver_paper,kreiszeitung_de) S3method(pb_deliver_paper,ksta_de) diff --git a/R/deliver_jungefreiheit_de.R b/R/deliver_jungefreiheit_de.R new file mode 100644 index 0000000..3c55803 --- /dev/null +++ b/R/deliver_jungefreiheit_de.R @@ -0,0 +1,30 @@ +#' @export +pb_deliver_paper.jungefreiheit_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + json_df <- json_df$`@graph` + if (any(json_df$`@type` == "Person")) { + author <- toString(json_df$name[json_df$`@type` == "Person"]) + } else { + author <- "" + } + json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + text <- html %>% + rvest::html_nodes(".elementor-widget-container p, .elementor-widget-container h3") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 414f567..3b4c12c 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -67,6 +67,7 @@ "irishtimes.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.irishtimes.com/arc/outboundfeeds/feed-irish-news/" "irozhlas.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.irozhlas.cz/rss/irozhlas" "joe.ie","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.joe.ie/feed" +"jungefreiheit.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "karlsruhe-insider.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA 
"kreiszeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "ksta.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://feed.ksta.de/feed/rss/politik/index.rss" From 4d50b5c26af085f5f34d0dc446d7930222df46e2 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 19:55:54 +0200 Subject: [PATCH 101/121] added kabeleins.de --- NAMESPACE | 1 + R/deliver_kabeleins_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_kabeleins_de.R diff --git a/NAMESPACE b/NAMESPACE index 35c71c6..d4e9808 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -65,6 +65,7 @@ S3method(pb_deliver_paper,irishtimes_com) S3method(pb_deliver_paper,irozhlas_cz) S3method(pb_deliver_paper,joe_ie) S3method(pb_deliver_paper,jungefreiheit_de) +S3method(pb_deliver_paper,kabeleins_de) S3method(pb_deliver_paper,karlsruhe_insider_de) S3method(pb_deliver_paper,kreiszeitung_de) S3method(pb_deliver_paper,ksta_de) diff --git a/R/deliver_kabeleins_de.R b/R/deliver_kabeleins_de.R new file mode 100644 index 0000000..be93d01 --- /dev/null +++ b/R/deliver_kabeleins_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.kabeleins_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("p.css-1tkp8z5, h2.css-xfddm,p.css-1pcz62z") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 3b4c12c..063d137 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -68,6 +68,7 @@ "irozhlas.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.irozhlas.cz/rss/irozhlas" "joe.ie","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.joe.ie/feed" "jungefreiheit.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA +"kabeleins.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "karlsruhe-insider.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "kreiszeitung.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA 
"ksta.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://feed.ksta.de/feed/rss/politik/index.rss" From 4a8fff6dd583135c1a803e826fc5c09021812cec Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 20:02:24 +0200 Subject: [PATCH 102/121] added thueringer-allgemeine.de --- NAMESPACE | 1 + R/deliver_thueringer_allgemeine_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_thueringer_allgemeine_de.R diff --git a/NAMESPACE b/NAMESPACE index d4e9808..ed2d2c6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -139,6 +139,7 @@ S3method(pb_deliver_paper,thecanary_co) S3method(pb_deliver_paper,theguardian_com) S3method(pb_deliver_paper,thejournal_ie) S3method(pb_deliver_paper,thesun_ie) +S3method(pb_deliver_paper,thueringer_allgemeine_de) S3method(pb_deliver_paper,tz_de) S3method(pb_deliver_paper,usatoday_com) S3method(pb_deliver_paper,vice_com) diff --git a/R/deliver_thueringer_allgemeine_de.R b/R/deliver_thueringer_allgemeine_de.R new file mode 100644 index 0000000..4cab500 --- /dev/null +++ b/R/deliver_thueringer_allgemeine_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.thueringer_allgemeine_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".article-body p, .article-body h3") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 063d137..f6ec66f 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -147,6 +147,7 @@ "thestreet.com","![](https://img.shields.io/badge/status-requested-lightgrey)","[@JBGruber](https://github.com/JBGruber/)","","https://news.yahoo.com/rss.xml" "thesun.ie","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.thesun.ie/feed/" "thismorningwithgordondeal.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA +"thueringer-allgemeine.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "time.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "tribpub.com","![](https://img.shields.io/badge/status-requested-lightgrey)","","[#1](https://github.com/JBGruber/paperboy/issues/1)",NA "tz.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://www.tz.de/welt/rssfeed.rdf" From cc90fda0de7acb201a93a01b892a3efca05c034b Mon Sep 17 
00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 20:04:45 +0200 Subject: [PATCH 103/121] added watson.ch --- NAMESPACE | 1 + R/deliver_watson_ch.R | 22 ++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 24 insertions(+) create mode 100644 R/deliver_watson_ch.R diff --git a/NAMESPACE b/NAMESPACE index ed2d2c6..8b74fbc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -148,6 +148,7 @@ S3method(pb_deliver_paper,volksstimme_de) S3method(pb_deliver_paper,vox_de) S3method(pb_deliver_paper,wa_de) S3method(pb_deliver_paper,washingtonpost_com) +S3method(pb_deliver_paper,watson_ch) S3method(pb_deliver_paper,watson_de) S3method(pb_deliver_paper,waz_de) S3method(pb_deliver_paper,wdr_de) diff --git a/R/deliver_watson_ch.R b/R/deliver_watson_ch.R new file mode 100644 index 0000000..1aaaae6 --- /dev/null +++ b/R/deliver_watson_ch.R @@ -0,0 +1,22 @@ +#' @export +pb_deliver_paper.watson_ch <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- json_df$articleBody + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index f6ec66f..9848118 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -160,6 +160,7 @@ "vox.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA 
"washingtonpost.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.washingtonpost.com/rss/world" "watson.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA +"watson.ch","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "wa.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "waz.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.waz.de/rss" "wdr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www1.wdr.de/wissen/uebersicht-nachrichten-100.feed" From ba2023c09ab58f8820ab70791a7ba6da07e31b98 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Fri, 18 Oct 2024 20:12:26 +0200 Subject: [PATCH 104/121] added maz-online.de --- NAMESPACE | 1 + R/deliver_maz_online_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_maz_online_de.R diff --git a/NAMESPACE b/NAMESPACE index 8b74fbc..8dae9ab 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -75,6 +75,7 @@ S3method(pb_deliver_paper,lidovky_cz) S3method(pb_deliver_paper,lvz_de) S3method(pb_deliver_paper,manager_magazin_de) S3method(pb_deliver_paper,marketwatch_com) +S3method(pb_deliver_paper,maz_online_de) S3method(pb_deliver_paper,mdr_de) S3method(pb_deliver_paper,mediacourant_nl) S3method(pb_deliver_paper,merkur_de) diff --git 
a/R/deliver_maz_online_de.R b/R/deliver_maz_online_de.R new file mode 100644 index 0000000..d04a96e --- /dev/null +++ b/R/deliver_maz_online_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.maz_online_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[3] %>% rvest::html_text() + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("header .Textstyled__Text-sc-1cqv9mi-0, article .Textstyled__Text-sc-1cqv9mi-0, article h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) +} diff --git a/inst/status.csv b/inst/status.csv index 9848118..db93467 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -79,6 +79,7 @@ "lvz.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.lvz.de/arc/outboundfeeds/rss/" "manager-magazin.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://www.manager-magazin.de/news/index.rss" "marketwatch.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA +"maz-online.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA 
"mdr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "mediacourant.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.mediacourant.nl/feed/" "merkur.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "http://www.merkur.de/rssfeed.rdf" From 29f154ea329e511ca4afda91bd3d02c095f39b67 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Sun, 20 Oct 2024 22:07:37 +0200 Subject: [PATCH 105/121] better json error handling (part 1) --- R/deliver_abendblatt_de.R | 36 ++++++++------ R/deliver_abendzeitung_muenchen_de.R | 36 ++++++++------ R/deliver_badische_zeitung_de.R | 36 ++++++++------ R/deliver_berliner_zeitung_de.R | 36 ++++++++------ R/deliver_bnn_de.R | 41 +++++++++------- R/deliver_br_de.R | 70 ++++++++++++++------------ R/deliver_businessinsider_de.R | 48 +++++++++--------- R/deliver_der_postillon_com.R | 52 +++++++++++--------- R/deliver_derwesten_de.R | 44 +++++++++-------- R/deliver_echo24_de.R | 38 ++++++++------- R/deliver_epochtimes_de.R | 30 +++++++----- R/deliver_express_de.R | 52 +++++++++++--------- R/deliver_finanzen_net.R | 36 ++++++++------ R/deliver_fnp_de.R | 29 +++++++++++ R/deliver_fnp_de.r | 25 ---------- R/deliver_focus_de.R | 36 ++++++++------ R/deliver_fr_de.R | 38 ++++++++------- R/deliver_freiepresse_de.R | 41 +++++++++------- R/deliver_haz_de.R | 41 +++++++++------- R/deliver_heidelberg24_de.R | 38 ++++++++------- R/deliver_heise_de.R | 34 +++++++------ R/deliver_hna_de.R | 38 ++++++++------- R/deliver_infranken_de.R | 30 +++++++----- R/deliver_jungefreiheit_de.R | 46 ++++++++++-------- R/deliver_kabeleins_de.R | 36 ++++++++------ R/deliver_karlsruhe_insider_de.R | 48 +++++++++--------- 
R/deliver_kreiszeitung_de.R | 38 ++++++++------- R/deliver_ksta_de.R | 52 +++++++++++--------- R/deliver_kurier_at.R | 36 ++++++++------ R/deliver_lvz_de.R | 36 ++++++++------ R/deliver_manager_magazin_de.R | 38 ++++++++------- R/deliver_maz_online_de.R | 36 ++++++++------ R/deliver_mdr_de.R | 36 ++++++++------ R/deliver_merkur_de.R | 38 ++++++++------- R/deliver_mopo_de.R | 52 +++++++++++--------- R/deliver_morgenpost_de.R | 36 ++++++++------ R/deliver_n-tv_de.R | 36 ++++++++------ R/deliver_ndr_de.R | 46 ++++++++++-------- R/deliver_news_de.R | 34 +++++++------ R/deliver_news_und_nachrichten_de.R | 30 +++++++----- R/deliver_newsflash24_de.R | 46 ++++++++++-------- R/deliver_nordkurier_de.R | 36 ++++++++------ R/deliver_noz_de.R | 36 ++++++++------ R/deliver_nw_de.R | 36 ++++++++------ R/deliver_nzz_ch.R | 36 ++++++++------ R/deliver_orf_at.R | 36 ++++++++------ R/deliver_ostsee_zeitung_de.R | 41 +++++++++------- R/deliver_presseportal_de.R | 36 ++++++++------ R/deliver_prosieben_de.R | 58 ++++++++++++---------- R/deliver_rnd_de.R | 44 +++++++++-------- R/deliver_rollingstone_de.R | 46 ++++++++++-------- R/deliver_rp_online_de.R | 36 ++++++++------ R/deliver_rtl_de.R | 48 +++++++++--------- R/deliver_ruhr24_de.R | 38 ++++++++------- R/deliver_ruhrnachrichten_de.R | 48 +++++++++--------- R/deliver_saechsische_de.R | 36 ++++++++------ R/deliver_shz_de.R | 36 ++++++++------ R/deliver_stern_de.R | 36 ++++++++------ R/deliver_stuttgarter_zeitung_de.R | 40 ++++++++------- R/deliver_sueddeutsche_de.R | 30 +++++++----- R/deliver_suedkurier_de.R | 46 ++++++++++-------- R/deliver_swp_de.R | 36 ++++++++------ R/deliver_t3n_de.R | 32 ++++++------ R/deliver_t_online_de.R | 38 ++++++++------- R/deliver_tag24_de.R | 30 +++++++----- R/deliver_tagesschau_de.R | 30 +++++++----- R/deliver_tagesspiegel_de.R | 36 ++++++++------ R/deliver_thueringer_allgemeine_de.R | 36 ++++++++------ R/deliver_tz_de.R | 29 +++++++++++ R/deliver_tz_de.r | 25 ---------- R/deliver_vice_com.R | 36 
++++++++------ R/deliver_volksstimme_de.R | 36 ++++++++------ R/deliver_vox_de.R | 46 ++++++++++-------- R/deliver_wa_de.R | 38 ++++++++------- R/deliver_watson_ch.R | 30 +++++++----- R/deliver_watson_de.R | 30 +++++++----- R/deliver_waz_de.R | 36 ++++++++------ R/deliver_wdr_de.R | 40 ++++++++------- R/deliver_welt_de.R | 30 +++++++----- R/deliver_wiwo_de.R | 52 +++++++++++--------- R/deliver_zdf_de.R | 73 +++++++++++++++------------- 81 files changed, 1746 insertions(+), 1417 deletions(-) create mode 100644 R/deliver_fnp_de.R delete mode 100644 R/deliver_fnp_de.r create mode 100644 R/deliver_tz_de.R delete mode 100644 R/deliver_tz_de.r diff --git a/R/deliver_abendblatt_de.R b/R/deliver_abendblatt_de.R index d043093..41bf0a8 100644 --- a/R/deliver_abendblatt_de.R +++ b/R/deliver_abendblatt_de.R @@ -4,24 +4,28 @@ pb_deliver_paper.abendblatt_de <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[2]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".article-body h3, .article-body p") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".article-body h3, .article-body p") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( 
+ datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } # rss feed includes pages that cannot be parsed because they are subpages # rss feed also includes podcast, which cannot be parsed diff --git a/R/deliver_abendzeitung_muenchen_de.R b/R/deliver_abendzeitung_muenchen_de.R index 3a99054..d9764db 100644 --- a/R/deliver_abendzeitung_muenchen_de.R +++ b/R/deliver_abendzeitung_muenchen_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.abendzeitung_muenchen_de <- function(x, verbose = NULL, pb, ... # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".artdetail_short ,.artdetail_text p,.artdetail_text h2") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".artdetail_short ,.artdetail_text p,.artdetail_text h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_badische_zeitung_de.R b/R/deliver_badische_zeitung_de.R index c94d4bb..da92a5f 100644 --- a/R/deliver_badische_zeitung_de.R +++ b/R/deliver_badische_zeitung_de.R @@ -4,22 +4,26 @@ 
pb_deliver_paper.badische_zeitung_de <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(iconv(x$content_raw, from = "ISO-8859-1", to = "UTF-8")) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author) - text <- html %>% - rvest::html_nodes("section[role = \"article\"], .article-site__topic") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author) + text <- html %>% + rvest::html_nodes("section[role = \"article\"], .article-site__topic") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_berliner_zeitung_de.R b/R/deliver_berliner_zeitung_de.R index 009b61e..918fd0f 100644 --- a/R/deliver_berliner_zeitung_de.R +++ b/R/deliver_berliner_zeitung_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.berliner_zeitung_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".article_paragraph__hXYKJ") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".article_paragraph__hXYKJ") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_bnn_de.R b/R/deliver_bnn_de.R index 7551201..47d10fe 100644 --- a/R/deliver_bnn_de.R +++ b/R/deliver_bnn_de.R @@ -4,23 +4,30 @@ pb_deliver_paper.bnn_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - article <- grepl("\"NewsArticle\"", json_txt) - json_df <- jsonlite::fromJSON(json_txt[article]) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + article <- grepl("\"NewsArticle\"", json_txt) + if (!any(article)) { + return(s_n_list()) + } + json_df <- jsonlite::fromJSON(json_txt[article]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".intro,.article__body p,.article__body h2") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".intro,.article__body p,.article__body h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_br_de.R b/R/deliver_br_de.R index 6776d40..70ee3ad 100644 --- a/R/deliver_br_de.R +++ b/R/deliver_br_de.R @@ -4,38 +4,44 @@ pb_deliver_paper.br_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - json_df <- lapply(json_txt, jsonlite::fromJSON) - if (is.null(names(json_df))) { - types <- sapply(json_df, function(x) x$`@type`) - if (any(types == "NewsArticle")) { - json_df <- json_df[types == "NewsArticle"][[1]] - } else if (any(types == "VideoObject")) { - json_df <- json_df[types == "VideoObject"][[1]] - } else if (any(types == "AudioObject")) { - json_df <- json_df[types == "AudioObject"][[1]] - } - } - if (json_df$`@type` != "VideoObject" && json_df$`@type` != "AudioObject") { # NewsArticle - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_node(".RichText_richText__wS9Rz.body3") %>% - rvest::html_nodes("p, h2") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) } else { - datetime <- lubridate::as_datetime(json_df$uploadDate) - headline <- json_df$name - author <- "" - text <- json_df$description + json_df <- jsonlite::fromJSON(json_txt) + # if (is.null(names(json_df))) { + + if (any(json_df$`@type` == "NewsArticle")) { + json_df <- json_df[json_df$`@type` == "NewsArticle", ] + } else if (any(json_df$`@type` == "VideoObject")) { + json_df <- json_df[json_df$`@type` == "VideoObject", ] + } else if (any(json_df$`@type` == "AudioObject")) { + json_df <- json_df[json_df$`@type` == "AudioObject", ] + } else { + return(s_n_list()) + } + # } + if (json_df$`@type` != "VideoObject" && json_df$`@type` != "AudioObject") { # NewsArticle + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + 
rvest::html_node(".RichText_richText__wS9Rz.body3") %>% + rvest::html_nodes("p, h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + } else { + datetime <- lubridate::as_datetime(json_df$uploadDate) + headline <- json_df$name + author <- "" + text <- json_df$description + } + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) } - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) } diff --git a/R/deliver_businessinsider_de.R b/R/deliver_businessinsider_de.R index df4f50d..48e561f 100644 --- a/R/deliver_businessinsider_de.R +++ b/R/deliver_businessinsider_de.R @@ -4,28 +4,32 @@ pb_deliver_paper.businessinsider_de <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$`@graph` - if (any(json_df$`@type` == "Person")) { - author <- toString(json_df$name[json_df$`@type` == "Person"]) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) } else { - author <- "" - } - json_df <- json_df[1, ] - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - text <- html %>% - rvest::html_node(".article-main") %>% - rvest::html_nodes("p, h2") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$`@graph` + if (any(json_df$`@type` == "Person")) { + author <- toString(json_df$name[json_df$`@type` == "Person"]) + } else { + author <- "" + } + json_df <- json_df[1, ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + text <- html %>% + rvest::html_node(".article-main") %>% + 
rvest::html_nodes("p, h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_der_postillon_com.R b/R/deliver_der_postillon_com.R index 96d211d..0372ce6 100644 --- a/R/deliver_der_postillon_com.R +++ b/R/deliver_der_postillon_com.R @@ -4,32 +4,36 @@ pb_deliver_paper.der_postillon_com <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".post-body p") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".post-body p") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - # author abbr can be found at the end of the article - if (author == "Der Postillon") { - author_tmp <- html %>% - rvest::html_node("div[id='post-body'] span[style='font-size: x-small;']") %>% - rvest::html_text() %>% - sub("; Erstver.*$", "", .) 
- if (author_tmp != "") { - author <- author_tmp + # author abbr can be found at the end of the article + if (author == "Der Postillon") { + author_tmp <- html %>% + rvest::html_node("div[id='post-body'] span[style='font-size: x-small;']") %>% + rvest::html_text() %>% + sub("; Erstver.*$", "", .) + if (author_tmp != "") { + author <- author_tmp + } } + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) } - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) } diff --git a/R/deliver_derwesten_de.R b/R/deliver_derwesten_de.R index 3bcb4ae..0a5ddbf 100644 --- a/R/deliver_derwesten_de.R +++ b/R/deliver_derwesten_de.R @@ -4,26 +4,30 @@ pb_deliver_paper.derwesten_de <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$`@graph`[1, ] - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- html %>% - rvest::html_nodes(".author.vcard .url.fn.n") %>% - rvest::html_text() %>% - toString() + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$`@graph`[1, ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- html %>% + rvest::html_nodes(".author.vcard .url.fn.n") %>% + rvest::html_text() %>% + toString() - text <- html %>% - rvest::html_nodes(".lead p,.article-body p") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + text <- html %>% + rvest::html_nodes(".lead p,.article-body p") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - 
s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_echo24_de.R b/R/deliver_echo24_de.R index fc52a5c..f040036 100644 --- a/R/deliver_echo24_de.R +++ b/R/deliver_echo24_de.R @@ -4,23 +4,27 @@ pb_deliver_paper.echo24_de <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$mainEntity + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$mainEntity - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_epochtimes_de.R b/R/deliver_epochtimes_de.R index 1a91fc0..65afe8b 100644 --- a/R/deliver_epochtimes_de.R +++ b/R/deliver_epochtimes_de.R @@ -4,19 +4,23 
@@ pb_deliver_paper.epochtimes_de <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[2]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- json_df$articleBody + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- json_df$articleBody - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_express_de.R b/R/deliver_express_de.R index 664f237..06fd094 100644 --- a/R/deliver_express_de.R +++ b/R/deliver_express_de.R @@ -4,30 +4,34 @@ pb_deliver_paper.express_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$`@graph` - if (any(json_df$`@type` == "Person")) { - author <- toString(json_df$name[json_df$`@type` == "Person"]) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) } else { - author <- "" + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$`@graph` + if (any(json_df$`@type` == "Person")) { + author <- toString(json_df$name[json_df$`@type` == "Person"]) + } else { + author <- "" + } + json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- sub(" \\| .*", "", json_df$headline) + text <- html %>% + rvest::html_nodes(".dm-article__intro,.dm-paragraph,.dm-article__subheadline") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + if (author == "") { + # the text has the author abbr. at the end + author <- sub(".*\\(([^)]+)\\)$", "\\1", text) + } + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) } - json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- sub(" \\| .*", "", json_df$headline) - text <- html %>% - rvest::html_nodes(".dm-article__intro,.dm-paragraph,.dm-article__subheadline") %>% - rvest::html_text2() %>% - paste(collapse = "\n") - if (author == "") { - # the text has the author abbr. 
at the end - author <- sub(".*\\(([^)]+)\\)$", "\\1", text) - } - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) } diff --git a/R/deliver_finanzen_net.R b/R/deliver_finanzen_net.R index f6abc23..2f30c10 100644 --- a/R/deliver_finanzen_net.R +++ b/R/deliver_finanzen_net.R @@ -4,22 +4,26 @@ pb_deliver_paper.finanzen_net <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[2]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes("p.h3, .news-container__text p, .news-container__text h2") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("p.h3, .news-container__text p, .news-container__text h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_fnp_de.R b/R/deliver_fnp_de.R new file mode 100644 index 0000000..2053593 --- /dev/null +++ b/R/deliver_fnp_de.R @@ -0,0 +1,29 @@ +#' @export +pb_deliver_paper.fnp_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$mainEntity + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } +} diff --git a/R/deliver_fnp_de.r b/R/deliver_fnp_de.r deleted file mode 100644 index 4f2a587..0000000 --- a/R/deliver_fnp_de.r +++ /dev/null @@ -1,25 +0,0 @@ -#' @export -pb_deliver_paper.fnp_de <- function(x, verbose = NULL, pb, ...) { - pb_tick(x, verbose, pb) - # raw html is stored in column content_raw - html <- rvest::read_html(x$content_raw) - - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$mainEntity - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% - rvest::html_text2() %>% - paste(collapse = "\n") - - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) -} diff --git a/R/deliver_focus_de.R b/R/deliver_focus_de.R index 60b4d32..5f52706 100644 --- a/R/deliver_focus_de.R +++ b/R/deliver_focus_de.R @@ -4,23 +4,27 @@ pb_deliver_paper.focus_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_elements(".leadIn,.textBlock") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_elements(".leadIn,.textBlock") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_fr_de.R b/R/deliver_fr_de.R index e77617b..148d807 100644 --- a/R/deliver_fr_de.R +++ b/R/deliver_fr_de.R @@ -4,24 +4,28 @@ pb_deliver_paper.fr_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$mainEntity + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$mainEntity - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_elements(".id-StoryElement-leadText,.id-StoryElement-paragraph") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_elements(".id-StoryElement-leadText,.id-StoryElement-paragraph") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_freiepresse_de.R b/R/deliver_freiepresse_de.R index af3db23..fd6174d 100644 --- a/R/deliver_freiepresse_de.R +++ b/R/deliver_freiepresse_de.R @@ -4,23 +4,30 @@ pb_deliver_paper.freiepresse_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - json_txt <- json_txt[grepl("NewsArticle", json_txt)] - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_txt <- json_txt[grepl("NewsArticle", json_txt)] + if (length(json_txt) == 0) { + return(s_n_list()) + } + json_df <- jsonlite::fromJSON(json_txt) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author) - text <- html %>% - rvest::html_nodes(".article__text p,.article__text h2") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author) + text <- html %>% + rvest::html_nodes(".article__text p,.article__text h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_haz_de.R b/R/deliver_haz_de.R index f7cc1ba..5eade39 100644 --- a/R/deliver_haz_de.R +++ b/R/deliver_haz_de.R @@ -4,23 +4,30 @@ pb_deliver_paper.haz_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - json_txt <- json_txt[grepl("NewsArticle", json_txt)] - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_txt <- json_txt[grepl("NewsArticle", json_txt)] + if (length(json_txt) == 0) { + return(s_n_list()) + } + json_df <- jsonlite::fromJSON(json_txt) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2,.Textstyled__Text-sc-1cqv9mi-0.gqSIEH") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2,.Textstyled__Text-sc-1cqv9mi-0.gqSIEH") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_heidelberg24_de.R b/R/deliver_heidelberg24_de.R index 8b96a1d..fb1222f 100644 --- a/R/deliver_heidelberg24_de.R +++ b/R/deliver_heidelberg24_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.heidelberg24_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$mainEntity - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$mainEntity + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_heise_de.R b/R/deliver_heise_de.R index 3432b23..76c8b90 100644 --- a/R/deliver_heise_de.R +++ b/R/deliver_heise_de.R @@ -4,21 +4,25 @@ pb_deliver_paper.heise_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt)) | length(json_txt) == 0) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes("#lead,#article-content-body .ringCommonDetail.ringBlockType-paragraph,.article-content,.a-article-header__lead") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + text <- html %>% + rvest::html_nodes("#lead,#article-content-body .ringCommonDetail.ringBlockType-paragraph,.article-content,.a-article-header__lead") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text - ) + s_n_list( + datetime, + author, + headline, + text + ) + } } diff --git a/R/deliver_hna_de.R b/R/deliver_hna_de.R index 4346251..bd3b1c8 100644 --- a/R/deliver_hna_de.R +++ b/R/deliver_hna_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.hna_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$mainEntity - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$mainEntity + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_infranken_de.R b/R/deliver_infranken_de.R index 57616b3..e63416c 100644 --- a/R/deliver_infranken_de.R +++ b/R/deliver_infranken_de.R @@ -4,19 +4,23 @@ pb_deliver_paper.infranken_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- json_df$articleBody + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- json_df$articleBody - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_jungefreiheit_de.R b/R/deliver_jungefreiheit_de.R index 3c55803..8731873 100644 --- a/R/deliver_jungefreiheit_de.R +++ b/R/deliver_jungefreiheit_de.R @@ -4,27 +4,31 @@ pb_deliver_paper.jungefreiheit_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$`@graph` - if (any(json_df$`@type` == "Person")) { - author <- toString(json_df$name[json_df$`@type` == "Person"]) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) } else { - author <- "" - } - json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - text <- html %>% - rvest::html_nodes(".elementor-widget-container p, .elementor-widget-container h3") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$`@graph` + if (any(json_df$`@type` == "Person")) { + author <- toString(json_df$name[json_df$`@type` == "Person"]) + } else { + author <- "" + } + json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + text <- html %>% + rvest::html_nodes(".elementor-widget-container p, .elementor-widget-container h3") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_kabeleins_de.R b/R/deliver_kabeleins_de.R index be93d01..f176536 100644 --- a/R/deliver_kabeleins_de.R +++ b/R/deliver_kabeleins_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.kabeleins_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[2]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes("p.css-1tkp8z5, h2.css-xfddm,p.css-1pcz62z") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("p.css-1tkp8z5, h2.css-xfddm,p.css-1pcz62z") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_karlsruhe_insider_de.R b/R/deliver_karlsruhe_insider_de.R index 66e531e..57a0aee 100644 --- a/R/deliver_karlsruhe_insider_de.R +++ b/R/deliver_karlsruhe_insider_de.R @@ -4,28 +4,32 @@ pb_deliver_paper.karlsruhe_insider_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$`@graph` - if (any(json_df$`@type` == "Person")) { - author <- toString(json_df$name[json_df$`@type` == "Person"]) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) } else { - author <- "" - } - json_df <- json_df[1, ] - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - text <- html %>% - rvest::html_node("article .td-post-content") %>% - rvest::html_nodes("p, h2") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$`@graph` + if (any(json_df$`@type` == "Person")) { + author <- toString(json_df$name[json_df$`@type` == "Person"]) + } else { + author <- "" + } + json_df <- json_df[1, ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + text <- html %>% + rvest::html_node("article .td-post-content") %>% + rvest::html_nodes("p, h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_kreiszeitung_de.R b/R/deliver_kreiszeitung_de.R index 19536ab..d9595b2 100644 --- a/R/deliver_kreiszeitung_de.R +++ b/R/deliver_kreiszeitung_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.kreiszeitung_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$mainEntity - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$mainEntity + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_ksta_de.R b/R/deliver_ksta_de.R index 8934857..b55757f 100644 --- a/R/deliver_ksta_de.R +++ b/R/deliver_ksta_de.R @@ -4,30 +4,34 @@ pb_deliver_paper.ksta_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$`@graph` - if (any(json_df$`@type` == "Person")) { - author <- toString(json_df$name[json_df$`@type` == "Person"]) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) } else { - author <- "" + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$`@graph` + if (any(json_df$`@type` == "Person")) { + author <- toString(json_df$name[json_df$`@type` == "Person"]) + } else { + author <- "" + } + json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- sub(" \\| .*", "", json_df$headline) + text <- html %>% + rvest::html_nodes(".dm-article__intro,.dm-paragraph,.dm-article__subheadline") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + if (author == "") { + # the text has the author abbr. at the end + author <- sub(".*\\(([^)]+)\\)$", "\\1", text) + } + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) } - json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- sub(" \\| .*", "", json_df$headline) - text <- html %>% - rvest::html_nodes(".dm-article__intro,.dm-paragraph,.dm-article__subheadline") %>% - rvest::html_text2() %>% - paste(collapse = "\n") - if (author == "") { - # the text has the author abbr. 
at the end - author <- sub(".*\\(([^)]+)\\)$", "\\1", text) - } - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) } diff --git a/R/deliver_kurier_at.R b/R/deliver_kurier_at.R index c40452b..ab3bc9d 100644 --- a/R/deliver_kurier_at.R +++ b/R/deliver_kurier_at.R @@ -4,22 +4,26 @@ pb_deliver_paper.kurier_at <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".headerComp-intro,.paragraph.copy") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".headerComp-intro,.paragraph.copy") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_lvz_de.R b/R/deliver_lvz_de.R index 136dc78..7c45d14 100644 --- a/R/deliver_lvz_de.R +++ b/R/deliver_lvz_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.lvz_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[3] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[3]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Headlinestyled__Headline-sc-mamptc-0,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Textstyled__Text-sc-1cqv9mi-0") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Headlinestyled__Headline-sc-mamptc-0,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Textstyled__Text-sc-1cqv9mi-0") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_manager_magazin_de.R b/R/deliver_manager_magazin_de.R index 137bf07..74ecd51 100644 --- a/R/deliver_manager_magazin_de.R +++ b/R/deliver_manager_magazin_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.manager_magazin_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df[json_df$`@type` == "NewsArticle", ] - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".leading-loose, .RichText p, .RichText h3") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df[json_df$`@type` == "NewsArticle", ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".leading-loose, .RichText p, .RichText h3") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_maz_online_de.R b/R/deliver_maz_online_de.R index d04a96e..75c73e1 100644 --- a/R/deliver_maz_online_de.R +++ b/R/deliver_maz_online_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.maz_online_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[3] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[3]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes("header .Textstyled__Text-sc-1cqv9mi-0, article .Textstyled__Text-sc-1cqv9mi-0, article h2") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("header .Textstyled__Text-sc-1cqv9mi-0, article .Textstyled__Text-sc-1cqv9mi-0, article h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_mdr_de.R b/R/deliver_mdr_de.R index 4650fc1..6854d56 100644 --- a/R/deliver_mdr_de.R +++ b/R/deliver_mdr_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.mdr_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".einleitung,.paragraph") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".einleitung,.paragraph") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_merkur_de.R b/R/deliver_merkur_de.R index 8f670a0..5ca6c91 100644 --- a/R/deliver_merkur_de.R +++ b/R/deliver_merkur_de.R @@ -4,23 +4,27 @@ pb_deliver_paper.merkur_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$mainEntity - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$mainEntity + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + text <- html %>% + rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_mopo_de.R b/R/deliver_mopo_de.R index 4fc5456..6cae71d 100644 --- a/R/deliver_mopo_de.R +++ b/R/deliver_mopo_de.R @@ -4,30 +4,34 @@ pb_deliver_paper.mopo_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$`@graph` - if (any(json_df$`@type` == "Person")) { - author <- toString(json_df$name[json_df$`@type` == "Person"]) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) } else { - author <- "" + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$`@graph` + if (any(json_df$`@type` == "Person")) { + author <- toString(json_df$name[json_df$`@type` == "Person"]) + } else { + author <- "" + } + json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- sub(" \\| .*", "", json_df$headline) + text <- html %>% + rvest::html_nodes("p, h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + if (author == "") { + # the text has the author abbr. at the end + author <- sub(".*\\(([^)]+)\\)$", "\\1", text) + } + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) } - json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- sub(" \\| .*", "", json_df$headline) - text <- html %>% - rvest::html_nodes("p, h2") %>% - rvest::html_text2() %>% - paste(collapse = "\n") - if (author == "") { - # the text has the author abbr. 
at the end - author <- sub(".*\\(([^)]+)\\)$", "\\1", text) - } - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) } diff --git a/R/deliver_morgenpost_de.R b/R/deliver_morgenpost_de.R index 6d4940d..67d1aa6 100644 --- a/R/deliver_morgenpost_de.R +++ b/R/deliver_morgenpost_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.morgenpost_de <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[2]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".article-body p") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".article-body p") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_n-tv_de.R b/R/deliver_n-tv_de.R index 5e3f096..58b0eed 100644 --- a/R/deliver_n-tv_de.R +++ b/R/deliver_n-tv_de.R @@ -4,23 +4,27 @@ pb_deliver_paper.n_tv_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_elements(".article__text") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_elements(".article__text") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_ndr_de.R b/R/deliver_ndr_de.R index 8de58a9..910d083 100644 --- a/R/deliver_ndr_de.R +++ b/R/deliver_ndr_de.R @@ -4,27 +4,31 @@ pb_deliver_paper.ndr_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - if (json_df$`@type` != "VideoObject" && json_df$`@type` != "AudioObject") { # NewsArticle - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".modulepadding.copytext p, .modulepadding.copytext h2") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) } else { - datetime <- lubridate::as_datetime(json_df$uploadDate) - headline <- json_df$name - author <- "" - text <- json_df$description + json_df <- jsonlite::fromJSON(json_txt[1]) + if (json_df$`@type` != "VideoObject" && json_df$`@type` != "AudioObject") { # NewsArticle + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".modulepadding.copytext p, .modulepadding.copytext h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + } else { + datetime <- lubridate::as_datetime(json_df$uploadDate) + headline <- json_df$name + author <- "" + text <- json_df$description + } + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) } - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) } diff --git a/R/deliver_news_de.R b/R/deliver_news_de.R index 024b887..0ca9cca 100644 --- a/R/deliver_news_de.R +++ b/R/deliver_news_de.R @@ -4,20 +4,24 @@ pb_deliver_paper.news_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- trimws(gsub("\\+\\+\\+.*?\\+\\+\\+", "", json_df$articleBody)) - text <- gsub("\r\n", "\n", text) - text <- gsub("Folgen Sie.*", "", text) - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- trimws(gsub("\\+\\+\\+.*?\\+\\+\\+", "", json_df$articleBody)) + text <- gsub("\r\n", "\n", text) + text <- gsub("Folgen Sie.*", "", text) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_news_und_nachrichten_de.R b/R/deliver_news_und_nachrichten_de.R index a168e0d..8b2a7f2 100644 --- a/R/deliver_news_und_nachrichten_de.R +++ b/R/deliver_news_und_nachrichten_de.R @@ -4,19 +4,23 @@ pb_deliver_paper.news_und_nachrichten_de <- function(x, verbose = NULL, pb, ...) 
# raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(gsub("[\r\n]*", "", json_txt)) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(gsub("[\r\n]*", "", json_txt[1])) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author) - text <- json_df$articleBody + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author) + text <- json_df$articleBody - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_newsflash24_de.R b/R/deliver_newsflash24_de.R index 66a56a4..95f41f8 100644 --- a/R/deliver_newsflash24_de.R +++ b/R/deliver_newsflash24_de.R @@ -4,27 +4,31 @@ pb_deliver_paper.newsflash24_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$`@graph` - if (any(json_df$`@type` == "Person")) { - author <- toString(json_df$name[json_df$`@type` == "Person"]) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) } else { - author <- "" - } - json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - text <- html %>% - rvest::html_nodes(".entry-content p, .entry-content h2") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$`@graph` + if (any(json_df$`@type` == "Person")) { + author <- toString(json_df$name[json_df$`@type` == "Person"]) + } else { + author <- "" + } + json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + text <- html %>% + rvest::html_nodes(".entry-content p, .entry-content h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_nordkurier_de.R b/R/deliver_nordkurier_de.R index 22b0a63..709f4d5 100644 --- a/R/deliver_nordkurier_de.R +++ b/R/deliver_nordkurier_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.nordkurier_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[2]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".tw-text-title-md, .paragraph,h2.tw-mb-4") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".tw-text-title-md, .paragraph,h2.tw-mb-4") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_noz_de.R b/R/deliver_noz_de.R index 45446d7..02453de 100644 --- a/R/deliver_noz_de.R +++ b/R/deliver_noz_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.noz_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes("p.w-600,section.content--group p, section.content--group h2") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("p.w-600,section.content--group p, section.content--group h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_nw_de.R b/R/deliver_nw_de.R index 999f24f..a0f9c58 100644 --- a/R/deliver_nw_de.R +++ b/R/deliver_nw_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.nw_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[2]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes("p.em_text,h2.Zwischenzeile") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("p.em_text,h2.Zwischenzeile") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_nzz_ch.R b/R/deliver_nzz_ch.R index f3012a2..278382e 100644 --- a/R/deliver_nzz_ch.R +++ b/R/deliver_nzz_ch.R @@ -4,22 +4,26 @@ pb_deliver_paper.nzz_ch <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[2]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".headline__lead,.articlecomponent.text,.subtitle,.articlecomponent") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".headline__lead,.articlecomponent.text,.subtitle,.articlecomponent") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_orf_at.R b/R/deliver_orf_at.R index d1ce977..fa14ae9 100644 --- a/R/deliver_orf_at.R +++ b/R/deliver_orf_at.R @@ -4,22 +4,26 @@ pb_deliver_paper.orf_at <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".story-lead-text,.story-story p,.story-story h2") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".story-lead-text,.story-story p,.story-story h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_ostsee_zeitung_de.R b/R/deliver_ostsee_zeitung_de.R index 07b557a..bf9a88c 100644 --- a/R/deliver_ostsee_zeitung_de.R +++ b/R/deliver_ostsee_zeitung_de.R @@ -4,23 +4,30 @@ pb_deliver_paper.ostsee_zeitung_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - json_txt <- json_txt[grepl("NewsArticle", json_txt)] - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_txt <- json_txt[grepl("NewsArticle", json_txt)] + if (length(json_txt) == 0) { + return(s_n_list()) + } + json_df <- jsonlite::fromJSON(json_txt) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Textstyled__Text-sc-1cqv9mi-0,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Headlinestyled__Headline-sc-mamptc-0") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Textstyled__Text-sc-1cqv9mi-0,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Headlinestyled__Headline-sc-mamptc-0") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_presseportal_de.R b/R/deliver_presseportal_de.R index 4aff63a..b18aa77 100644 --- a/R/deliver_presseportal_de.R +++ b/R/deliver_presseportal_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.presseportal_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[2]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes("article.story p:not([class])") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("article.story p:not([class])") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_prosieben_de.R b/R/deliver_prosieben_de.R index 9ebbd50..2b0e1c8 100644 --- a/R/deliver_prosieben_de.R +++ b/R/deliver_prosieben_de.R @@ -4,33 +4,37 @@ pb_deliver_paper.prosieben_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ") - if (length(json_txt) == 2) { - json_txt <- json_txt[2] %>% rvest::html_text() + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) } else { - json_txt <- json_txt %>% rvest::html_text() - } - json_df <- jsonlite::fromJSON(json_txt) - if (json_df$`@type` != "VideoObject") { # NewsArticle - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_elements(".css-f9qfdi p.css-bq2685,.css-f9qfdi h2") %>% - rvest::html_text2() %>% - paste(collapse = "\n") - } else { - datetime <- lubridate::as_datetime(json_df$uploadDate) - headline <- json_df$name - author <- "" - text <- json_df$description # for video objects, use description as text - } + if (length(json_txt) == 2) { + json_txt <- json_txt[2] %>% rvest::html_text() + } else { + json_txt <- json_txt %>% rvest::html_text() + } + json_df <- jsonlite::fromJSON(json_txt) + if (json_df$`@type` != "VideoObject") { # NewsArticle + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_elements(".css-f9qfdi p.css-bq2685,.css-f9qfdi h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + } else { + datetime <- lubridate::as_datetime(json_df$uploadDate) + headline <- json_df$name + author <- "" + text <- json_df$description # for video objects, use description as text + } - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_rnd_de.R 
b/R/deliver_rnd_de.R index 5052eec..915f98d 100644 --- a/R/deliver_rnd_de.R +++ b/R/deliver_rnd_de.R @@ -4,26 +4,30 @@ pb_deliver_paper.rnd_de <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[3] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[3]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0") %>% - rvest::html_text2() + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0") %>% + rvest::html_text2() - more_items <- html %>% # delete content in lists of related items - rvest::html_nodes("div[data-is-element-rendered='true']") %>% - rvest::html_nodes(".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0") %>% - rvest::html_text2() - text <- text[!text %in% more_items] %>% paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + more_items <- html %>% # delete content in lists of related items + rvest::html_nodes("div[data-is-element-rendered='true']") %>% + rvest::html_nodes(".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0") %>% + rvest::html_text2() + text <- text[!text %in% more_items] %>% paste(collapse = "\n") + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the 
whole json data of an article + ) + } } diff --git a/R/deliver_rollingstone_de.R b/R/deliver_rollingstone_de.R index c3d0415..6e0462a 100644 --- a/R/deliver_rollingstone_de.R +++ b/R/deliver_rollingstone_de.R @@ -4,27 +4,31 @@ pb_deliver_paper.rollingstone_de <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$`@graph` - if (any(json_df$`@type` == "Person")) { - author <- toString(json_df$name[json_df$`@type` == "Person"]) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) } else { - author <- "" - } - json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - text <- html %>% - rvest::html_nodes(".asmb-article-excerpt,.asmb-article-content-container h2,.asmb-article-content-container p") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$`@graph` + if (any(json_df$`@type` == "Person")) { + author <- toString(json_df$name[json_df$`@type` == "Person"]) + } else { + author <- "" + } + json_df <- json_df[grepl("NewsArticle", json_df$`@type`), ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + text <- html %>% + rvest::html_nodes(".asmb-article-excerpt,.asmb-article-content-container h2,.asmb-article-content-container p") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git 
a/R/deliver_rp_online_de.R b/R/deliver_rp_online_de.R index e8fb160..0064aba 100644 --- a/R/deliver_rp_online_de.R +++ b/R/deliver_rp_online_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.rp_online_de <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes("strong[data-cy=\"intro\"],div[data-cy=\"article_content\"] p") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("strong[data-cy=\"intro\"],div[data-cy=\"article_content\"] p") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_rtl_de.R b/R/deliver_rtl_de.R index 3238812..75ea01a 100644 --- a/R/deliver_rtl_de.R +++ b/R/deliver_rtl_de.R @@ -4,28 +4,32 @@ pb_deliver_paper.rtl_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - if (json_df$`@type` != "VideoObject") { # NewsArticle - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_elements(".article-body .LeadText_lead__rfwFU,.article-body .AnnotatedMarkup_paragraph__IUT9l") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) } else { - datetime <- lubridate::as_datetime(json_df$uploadDate) - headline <- json_df$name - author <- "" - text <- json_df$transcript # for video objects, use transcript as text - } + json_df <- jsonlite::fromJSON(json_txt[2]) + if (json_df$`@type` != "VideoObject") { # NewsArticle + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_elements(".article-body .LeadText_lead__rfwFU,.article-body .AnnotatedMarkup_paragraph__IUT9l") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + } else { + datetime <- lubridate::as_datetime(json_df$uploadDate) + headline <- json_df$name + author <- "" + text <- json_df$transcript # for video objects, use transcript as text + } - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_ruhr24_de.R b/R/deliver_ruhr24_de.R index 8097e81..0a861ef 100644 --- a/R/deliver_ruhr24_de.R +++ b/R/deliver_ruhr24_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.ruhr24_de <- function(x, 
verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$mainEntity - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-crosshead,.id-StoryElement-paragraph") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$mainEntity + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-crosshead,.id-StoryElement-paragraph") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_ruhrnachrichten_de.R b/R/deliver_ruhrnachrichten_de.R index 355e1d9..2bb36ec 100644 --- a/R/deliver_ruhrnachrichten_de.R +++ b/R/deliver_ruhrnachrichten_de.R @@ -4,28 +4,32 @@ pb_deliver_paper.ruhrnachrichten_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$`@graph` - if (any(json_df$`@type` == "Person")) { - author <- toString(json_df$name[json_df$`@type` == "Person"]) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) } else { - author <- "" - } - json_df <- json_df[grepl("NewsArticle|Article", json_df$`@type`), ] - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - text <- html %>% - rvest::html_nodes("p.article__teaser-text,.article__content p, .article__content h2") %>% - rvest::html_text2() %>% - paste(collapse = "\n") %>% - gsub("\nZur Startseite$", "", .) + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$`@graph` + if (any(json_df$`@type` == "Person")) { + author <- toString(json_df$name[json_df$`@type` == "Person"]) + } else { + author <- "" + } + json_df <- json_df[grepl("NewsArticle|Article", json_df$`@type`), ] + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + text <- html %>% + rvest::html_nodes("p.article__teaser-text,.article__content p, .article__content h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") %>% + gsub("\nZur Startseite$", "", .) - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_saechsische_de.R b/R/deliver_saechsische_de.R index 7ad3951..cd84973 100644 --- a/R/deliver_saechsische_de.R +++ b/R/deliver_saechsische_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.saechsische_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[3] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[3]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_shz_de.R b/R/deliver_shz_de.R index 8706bae..acebf28 100644 --- a/R/deliver_shz_de.R +++ b/R/deliver_shz_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.shz_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes("p.w-600, p,h2.h4") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("p.w-600, p,h2.h4") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_stern_de.R b/R/deliver_stern_de.R index 33e93ce..4a64f3e 100644 --- a/R/deliver_stern_de.R +++ b/R/deliver_stern_de.R @@ -4,23 +4,27 @@ pb_deliver_paper.stern_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt)[1, ] + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt)[1, ] - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_elements(".intro,.text-element") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_elements(".intro,.text-element") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_stuttgarter_zeitung_de.R b/R/deliver_stuttgarter_zeitung_de.R index b811af2..140de47 100644 --- a/R/deliver_stuttgarter_zeitung_de.R +++ b/R/deliver_stuttgarter_zeitung_de.R @@ -4,27 +4,31 @@ pb_deliver_paper.stuttgarter_zeitung_de <- function(x, verbose = NULL, pb, ...) 
# raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".brick.intro-text p,.brickgroup p,.brickgroup h2") %>% - rvest::html_text2() - rm_text <- c("StZ-Plus-Abonnement", "Vertrag mit Werbung") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".brick.intro-text p,.brickgroup p,.brickgroup h2") %>% + rvest::html_text2() + rm_text <- c("StZ-Plus-Abonnement", "Vertrag mit Werbung") - text <- text[!text %in% rm_text] %>% - paste(collapse = "\n") + text <- text[!text %in% rm_text] %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } # rss feed includes pages that cannot be parsed because they are subpages # rss feed also includes podcast, which cannot be parsed diff --git a/R/deliver_sueddeutsche_de.R b/R/deliver_sueddeutsche_de.R index bddcbd8..ad9a6e1 100644 --- a/R/deliver_sueddeutsche_de.R +++ b/R/deliver_sueddeutsche_de.R @@ -4,19 +4,23 @@ pb_deliver_paper.sueddeutsche_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- json_df$articleBody + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- json_df$articleBody - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_suedkurier_de.R b/R/deliver_suedkurier_de.R index cda60fe..640d1aa 100644 --- a/R/deliver_suedkurier_de.R +++ b/R/deliver_suedkurier_de.R @@ -4,26 +4,30 @@ pb_deliver_paper.suedkurier_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- html %>% - rvest::html_node("header h1") %>% - rvest::html_text() - author <- paste0("<p>", json_df$author$name, "</p>", collapse = ",") %>% - rvest::read_html() %>% - rvest::html_text() %>% - toString() - text <- html %>% - rvest::html_nodes(".article-summary,.article-jsonld.article-paywall-summary,.article-jsonld p") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- html %>% + rvest::html_node("header h1") %>% + rvest::html_text() + author <- paste0("<p>", json_df$author$name, "</p>", collapse = ",") %>% + rvest::read_html() %>% + rvest::html_text() %>% + toString() + text <- html %>% + rvest::html_nodes(".article-summary,.article-jsonld.article-paywall-summary,.article-jsonld p") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_swp_de.R b/R/deliver_swp_de.R index 13009db..49080f7 100644 --- a/R/deliver_swp_de.R +++ b/R/deliver_swp_de.R @@ -4,21 +4,25 @@ pb_deliver_paper.swp_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".u-article-header .fs-4,.u-paragraph, .u-title.u-headline") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[2]) + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".u-article-header .fs-4,.u-paragraph, .u-title.u-headline") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_t3n_de.R b/R/deliver_t3n_de.R index 7345e19..22db194 100644 --- a/R/deliver_t3n_de.R +++ b/R/deliver_t3n_de.R @@ -4,20 +4,24 @@ pb_deliver_paper.t3n_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- gsub("\r\n", "\n", json_df$articleBody) - text <- gsub("\\[.*?\\]", "", text) + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- gsub("\r\n", "\n", json_df$articleBody) + text <- gsub("\\[.*?\\]", "", text) - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_t_online_de.R b/R/deliver_t_online_de.R index 05bdb8a..dd868a4 100644 --- a/R/deliver_t_online_de.R +++ b/R/deliver_t_online_de.R @@ -4,23 +4,27 @@ pb_deliver_paper.t_online_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$`@graph`[1, ] + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$`@graph`[1, ] - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author[[1]]$name) - text <- html %>% - rvest::html_nodes("div[data-testid=\"ArticleBody.StreamLayout\"] p") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author[[1]]$name) + text <- html %>% + rvest::html_nodes("div[data-testid=\"ArticleBody.StreamLayout\"] p") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_tag24_de.R b/R/deliver_tag24_de.R index b8443e3..5de25f2 100644 --- a/R/deliver_tag24_de.R +++ b/R/deliver_tag24_de.R @@ -4,18 +4,22 @@ pb_deliver_paper.tag24_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- json_df$articleBody + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- json_df$articleBody - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_tagesschau_de.R b/R/deliver_tagesschau_de.R index effc1e2..2d76f70 100644 --- a/R/deliver_tagesschau_de.R +++ b/R/deliver_tagesschau_de.R @@ -4,19 +4,23 @@ pb_deliver_paper.tagesschau_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- trimws(gsub("<[^>]+>", "", json_df$articleBody)) + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- trimws(gsub("<[^>]+>", "", json_df$articleBody)) - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_tagesspiegel_de.R b/R/deliver_tagesspiegel_de.R index 2e40f49..d9ed5fc 100644 --- a/R/deliver_tagesspiegel_de.R +++ b/R/deliver_tagesspiegel_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.tagesspiegel_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes("#story-elements p") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("#story-elements p") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_thueringer_allgemeine_de.R b/R/deliver_thueringer_allgemeine_de.R index 4cab500..9033cb0 100644 --- a/R/deliver_thueringer_allgemeine_de.R +++ b/R/deliver_thueringer_allgemeine_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.thueringer_allgemeine_de <- function(x, verbose = NULL, pb, ... 
# raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[2]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".article-body p, .article-body h3") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".article-body p, .article-body h3") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_tz_de.R b/R/deliver_tz_de.R new file mode 100644 index 0000000..55129ae --- /dev/null +++ b/R/deliver_tz_de.R @@ -0,0 +1,29 @@ +#' @export +pb_deliver_paper.tz_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$mainEntity + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-crosshead,.id-StoryElement-paragraph") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } +} diff --git a/R/deliver_tz_de.r b/R/deliver_tz_de.r deleted file mode 100644 index 5a81d1a..0000000 --- a/R/deliver_tz_de.r +++ /dev/null @@ -1,25 +0,0 @@ -#' @export -pb_deliver_paper.tz_de <- function(x, verbose = NULL, pb, ...) { - pb_tick(x, verbose, pb) - # raw html is stored in column content_raw - html <- rvest::read_html(x$content_raw) - - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$mainEntity - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-crosshead,.id-StoryElement-paragraph") %>% - rvest::html_text2() %>% - paste(collapse = "\n") - - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) -} diff --git a/R/deliver_vice_com.R b/R/deliver_vice_com.R index 45d5123..f7ef3aa 100644 --- a/R/deliver_vice_com.R +++ b/R/deliver_vice_com.R @@ -4,22 +4,26 @@ pb_deliver_paper.vice_com <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[2]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".entry-content.entry-content p,.entry-content entry-content h2") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".entry-content.entry-content p,.entry-content entry-content h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_volksstimme_de.R b/R/deliver_volksstimme_de.R index 735f8a2..5c1d73a 100644 --- a/R/deliver_volksstimme_de.R +++ b/R/deliver_volksstimme_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.volksstimme_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".fp-article-heading__excerpt,.fp-paragraph, .fp-subheading") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".fp-article-heading__excerpt,.fp-paragraph, .fp-subheading") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_vox_de.R b/R/deliver_vox_de.R index f0cf8ea..9f9b0ad 100644 --- a/R/deliver_vox_de.R +++ b/R/deliver_vox_de.R @@ -4,27 +4,31 @@ pb_deliver_paper.vox_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - if (nrow(json_df) > 1) { - json_df <- json_df[json_df$`@type` == "Article", ] - } - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- json_df$articleBody - if (author == "VOX Online") { - # the text might have the author abbr. at the end - author_abbr <- sub(".*\\(([^)]+)\\)$", "\\1", text) - if (author_abbr != "") { - author <- author_abbr + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + if (length(json_df$`@type`) > 1) { + json_df <- json_df[json_df$`@type` == "Article", ] + } + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- json_df$articleBody + if (author == "VOX Online") { + # the text might have the author abbr. at the end + author_abbr <- sub(".*\\(([^)]+)\\)$", "\\1", text) + if (author_abbr != "") { + author <- author_abbr + } } + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) } - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) } diff --git a/R/deliver_wa_de.R b/R/deliver_wa_de.R index f0bd62f..89c64d2 100644 --- a/R/deliver_wa_de.R +++ b/R/deliver_wa_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.wa_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - json_df <- json_df$mainEntity - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + json_df <- json_df$mainEntity + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_watson_ch.R b/R/deliver_watson_ch.R index 1aaaae6..f09473b 100644 --- a/R/deliver_watson_ch.R +++ b/R/deliver_watson_ch.R @@ -4,19 +4,23 @@ pb_deliver_paper.watson_ch <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[2]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- json_df$articleBody + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- json_df$articleBody - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_watson_de.R b/R/deliver_watson_de.R index 5812ce4..c0b62ed 100644 --- a/R/deliver_watson_de.R +++ b/R/deliver_watson_de.R @@ -4,19 +4,23 @@ pb_deliver_paper.watson_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[2]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- json_df$articleBody + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- json_df$articleBody - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_waz_de.R b/R/deliver_waz_de.R index b6d275a..94604b4 100644 --- a/R/deliver_waz_de.R +++ b/R/deliver_waz_de.R @@ -4,22 +4,26 @@ pb_deliver_paper.waz_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[2] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[2]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".article-body p,.article-body h3") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".article-body p,.article-body h3") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_wdr_de.R b/R/deliver_wdr_de.R index c4bfb62..26a4c13 100644 --- a/R/deliver_wdr_de.R +++ b/R/deliver_wdr_de.R @@ -5,24 +5,28 @@ pb_deliver_paper.wdr_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) # careful: json can have many objects but the first seems to be the article - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) - date_tmp <- json_df$datePublished # missing sec - date_tmp <- sub("(\\d{2}:\\d{2})(\\+\\d{2}:\\d{2})", "\\1:00\\2", date_tmp) - datetime <- lubridate::as_datetime(date_tmp) - headline <- json_df$headline - author <- toString(json_df$author$name) %>% gsub("/", ",", .) 
- text <- html %>% - rvest::html_nodes(".einleitung,.text,.subtitle") %>% - rvest::html_text2() %>% - paste(collapse = "\n") + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + date_tmp <- json_df$datePublished # missing sec + date_tmp <- sub("(\\d{2}:\\d{2})(\\+\\d{2}:\\d{2})", "\\1:00\\2", date_tmp) + datetime <- lubridate::as_datetime(date_tmp) + headline <- json_df$headline + author <- toString(json_df$author$name) %>% gsub("/", ",", .) + text <- html %>% + rvest::html_nodes(".einleitung,.text,.subtitle") %>% + rvest::html_text2() %>% + paste(collapse = "\n") - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } # rss feed contains also overviews of articles which make the parser fail diff --git a/R/deliver_welt_de.R b/R/deliver_welt_de.R index 66d30a5..6ab45b7 100644 --- a/R/deliver_welt_de.R +++ b/R/deliver_welt_de.R @@ -4,19 +4,23 @@ pb_deliver_paper.welt_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - json_df <- jsonlite::fromJSON(json_txt) + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- trimws(gsub("<[^>]+>", "", json_df$articleBody)) + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- trimws(gsub("<[^>]+>", "", json_df$articleBody)) - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) + } } diff --git a/R/deliver_wiwo_de.R b/R/deliver_wiwo_de.R index 33807a7..94f2976 100644 --- a/R/deliver_wiwo_de.R +++ b/R/deliver_wiwo_de.R @@ -4,30 +4,34 @@ pb_deliver_paper.wiwo_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() - if (length(json_txt) != 0) { # otherwise the article is paywalled and not scrapeable - json_df <- jsonlite::fromJSON(json_txt) - - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$creator) - text <- html %>% - rvest::html_nodes(".c-leadtext,.u-richtext h3,.u-richtext p") %>% - rvest::html_text2() %>% - .[!grepl("Lesen Sie auch", .)] %>% # Remove links in between - paste(collapse = "\n") + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) } else { - datetime <- NA - headline <- NA - author <- NA - text <- NA - json_df <- list("no access") + if (length(json_txt) != 0) { # otherwise the article is paywalled and not scrapeable + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$creator) + text <- html %>% + rvest::html_nodes(".c-leadtext,.u-richtext h3,.u-richtext p") %>% + rvest::html_text2() %>% + .[!grepl("Lesen Sie auch", .)] %>% # Remove links in between + paste(collapse = "\n") + } else { + datetime <- NA + headline <- NA + author <- NA + text <- NA + json_df <- list("no access") + } + s_n_list( + datetime, + author, + headline, + text, + json_df + ) } - s_n_list( - datetime, - author, - headline, - text, - json_df - ) } diff --git a/R/deliver_zdf_de.R b/R/deliver_zdf_de.R index e11177c..317dc71 100644 --- a/R/deliver_zdf_de.R +++ b/R/deliver_zdf_de.R @@ -4,41 +4,44 @@ pb_deliver_paper.zdf_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ") - if (length(json_txt) != 1) { - json_txt <- json_txt[2] - } - json_txt <- json_txt %>% rvest::html_text() - json_df <- jsonlite::fromJSON(gsub("\r\n", " ", json_txt)) - if (json_df$`@type` != "VideoObject" && json_df$`@type` != "BreadcrumbList") { - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_nodes(".r1nj4qn5") %>% - rvest::html_text2() %>% - paste(collapse = "\n") - } else if (json_df$`@type` == "VideoObject") { - datetime <- lubridate::as_datetime(json_df$uploadDate) - headline <- json_df$name - author <- toString(json_df$publisher$name) - text <- json_df$description + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + if (isTRUE(is.na(json_txt))) { + return(s_n_list()) } else { - datetime <- html %>% - rvest::html_node("time") %>% - rvest::html_attr("datetime") %>% - lubridate::as_datetime() - headline <- html %>% - rvest::html_node("main h2") %>% - rvest::html_text() - author <- "" - text <- "" + if (length(json_txt) != 1) { + json_txt <- json_txt[2] + } + json_df <- jsonlite::fromJSON(gsub("\r\n", " ", json_txt)) + if (json_df$`@type` != "VideoObject" && json_df$`@type` != "BreadcrumbList" && json_df$`@type` != "WebSite") { + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".r1nj4qn5") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + } else if (json_df$`@type` == "VideoObject") { + datetime <- lubridate::as_datetime(json_df$uploadDate) + headline <- json_df$name + author <- toString(json_df$publisher$name) + text <- json_df$description + } else { + datetime <- html %>% + rvest::html_node("time") 
%>% + rvest::html_attr("datetime") %>% + lubridate::as_datetime() + headline <- html %>% + rvest::html_node("main h2") %>% + rvest::html_text2() + author <- "" + text <- "" + } + s_n_list( + datetime, + author, + headline, + text, + json_df # dumping the whole json data of an article + ) } - s_n_list( - datetime, - author, - headline, - text, - json_df # dumping the whole json data of an article - ) } From c37f7b0d6a8cdf609b7e29e8f43dbffd6763965a Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Mon, 21 Oct 2024 06:46:38 +0200 Subject: [PATCH 106/121] better json error handling (part 2) --- R/deliver_abendblatt_de.R | 2 +- R/deliver_abendzeitung_muenchen_de.R | 2 +- R/deliver_badische_zeitung_de.R | 2 +- R/deliver_berliner_zeitung_de.R | 2 +- R/deliver_bnn_de.R | 2 +- R/deliver_br_de.R | 2 +- R/deliver_businessinsider_de.R | 2 +- R/deliver_der_postillon_com.R | 2 +- R/deliver_derwesten_de.R | 2 +- R/deliver_echo24_de.R | 2 +- R/deliver_epochtimes_de.R | 2 +- R/deliver_express_de.R | 2 +- R/deliver_finanzen_net.R | 2 +- R/deliver_fnp_de.R | 2 +- R/deliver_focus_de.R | 2 +- R/deliver_fr_de.R | 2 +- R/deliver_freiepresse_de.R | 2 +- R/deliver_haz_de.R | 2 +- R/deliver_heidelberg24_de.R | 2 +- R/deliver_hna_de.R | 2 +- R/deliver_infranken_de.R | 2 +- R/deliver_jungefreiheit_de.R | 2 +- R/deliver_kabeleins_de.R | 2 +- R/deliver_karlsruhe_insider_de.R | 2 +- R/deliver_kreiszeitung_de.R | 2 +- R/deliver_ksta_de.R | 2 +- R/deliver_kurier_at.R | 2 +- R/deliver_lvz_de.R | 2 +- R/deliver_manager_magazin_de.R | 2 +- R/deliver_maz_online_de.R | 2 +- R/deliver_mdr_de.R | 2 +- R/deliver_merkur_de.R | 2 +- R/deliver_mopo_de.R | 2 +- R/deliver_morgenpost_de.R | 2 +- R/deliver_n-tv_de.R | 2 +- R/deliver_ndr_de.R | 2 +- R/deliver_news_de.R | 2 +- R/deliver_news_und_nachrichten_de.R | 2 +- R/deliver_newsflash24_de.R | 2 +- R/deliver_nordkurier_de.R | 2 +- R/deliver_noz_de.R | 2 +- R/deliver_nw_de.R | 2 +- R/deliver_nzz_ch.R | 2 +- R/deliver_orf_at.R | 2 +- 
R/deliver_ostsee_zeitung_de.R | 2 +- R/deliver_presseportal_de.R | 2 +- R/deliver_prosieben_de.R | 2 +- R/deliver_rnd_de.R | 2 +- R/deliver_rollingstone_de.R | 2 +- R/deliver_rp_online_de.R | 2 +- R/deliver_rtl_de.R | 2 +- R/deliver_ruhr24_de.R | 2 +- R/deliver_ruhrnachrichten_de.R | 2 +- R/deliver_saechsische_de.R | 2 +- R/deliver_shz_de.R | 2 +- R/deliver_stern_de.R | 2 +- R/deliver_stuttgarter_zeitung_de.R | 2 +- R/deliver_sueddeutsche_de.R | 2 +- R/deliver_suedkurier_de.R | 2 +- R/deliver_swp_de.R | 2 +- R/deliver_t3n_de.R | 2 +- R/deliver_t_online_de.R | 2 +- R/deliver_tag24_de.R | 2 +- R/deliver_tagesschau_de.R | 2 +- R/deliver_tagesspiegel_de.R | 2 +- R/deliver_thueringer_allgemeine_de.R | 2 +- R/deliver_tz_de.R | 2 +- R/deliver_vice_com.R | 2 +- R/deliver_volksstimme_de.R | 2 +- R/deliver_vox_de.R | 2 +- R/deliver_wa_de.R | 2 +- R/deliver_watson_ch.R | 2 +- R/deliver_watson_de.R | 2 +- R/deliver_waz_de.R | 2 +- R/deliver_wdr_de.R | 2 +- R/deliver_welt_de.R | 2 +- R/deliver_wiwo_de.R | 2 +- R/deliver_zdf_de.R | 2 +- 78 files changed, 78 insertions(+), 78 deletions(-) diff --git a/R/deliver_abendblatt_de.R b/R/deliver_abendblatt_de.R index 41bf0a8..c505090 100644 --- a/R/deliver_abendblatt_de.R +++ b/R/deliver_abendblatt_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.abendblatt_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) diff --git a/R/deliver_abendzeitung_muenchen_de.R b/R/deliver_abendzeitung_muenchen_de.R index d9764db..c833491 100644 --- a/R/deliver_abendzeitung_muenchen_de.R +++ b/R/deliver_abendzeitung_muenchen_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.abendzeitung_muenchen_de <- function(x, verbose = NULL, pb, ... 
html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_badische_zeitung_de.R b/R/deliver_badische_zeitung_de.R index da92a5f..1ba5911 100644 --- a/R/deliver_badische_zeitung_de.R +++ b/R/deliver_badische_zeitung_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.badische_zeitung_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(iconv(x$content_raw, from = "ISO-8859-1", to = "UTF-8")) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_berliner_zeitung_de.R b/R/deliver_berliner_zeitung_de.R index 918fd0f..ee0a32b 100644 --- a/R/deliver_berliner_zeitung_de.R +++ b/R/deliver_berliner_zeitung_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.berliner_zeitung_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_bnn_de.R b/R/deliver_bnn_de.R index 47d10fe..1e8a40e 100644 --- a/R/deliver_bnn_de.R +++ b/R/deliver_bnn_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.bnn_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { article <- grepl("\"NewsArticle\"", json_txt) diff --git a/R/deliver_br_de.R b/R/deliver_br_de.R index 70ee3ad..7b4a8a4 100644 --- a/R/deliver_br_de.R +++ b/R/deliver_br_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.br_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt) diff --git a/R/deliver_businessinsider_de.R b/R/deliver_businessinsider_de.R index 48e561f..ff2eba2 100644 --- a/R/deliver_businessinsider_de.R +++ b/R/deliver_businessinsider_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.businessinsider_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_der_postillon_com.R b/R/deliver_der_postillon_com.R index 0372ce6..5da42d2 100644 --- a/R/deliver_der_postillon_com.R +++ b/R/deliver_der_postillon_com.R @@ -5,7 +5,7 @@ pb_deliver_paper.der_postillon_com <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_derwesten_de.R b/R/deliver_derwesten_de.R index 0a5ddbf..7b3a10e 100644 --- a/R/deliver_derwesten_de.R +++ b/R/deliver_derwesten_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.derwesten_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_echo24_de.R b/R/deliver_echo24_de.R index f040036..7e4c74d 100644 --- a/R/deliver_echo24_de.R +++ b/R/deliver_echo24_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.echo24_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_epochtimes_de.R b/R/deliver_epochtimes_de.R index 65afe8b..ed92023 100644 --- a/R/deliver_epochtimes_de.R +++ b/R/deliver_epochtimes_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.epochtimes_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) diff --git a/R/deliver_express_de.R b/R/deliver_express_de.R index 06fd094..c1ebe6c 100644 --- a/R/deliver_express_de.R +++ b/R/deliver_express_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.express_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_finanzen_net.R b/R/deliver_finanzen_net.R index 2f30c10..9515835 100644 --- a/R/deliver_finanzen_net.R +++ b/R/deliver_finanzen_net.R @@ -5,7 +5,7 @@ pb_deliver_paper.finanzen_net <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) diff --git a/R/deliver_fnp_de.R b/R/deliver_fnp_de.R index 2053593..b998439 100644 --- a/R/deliver_fnp_de.R +++ b/R/deliver_fnp_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.fnp_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_focus_de.R b/R/deliver_focus_de.R index 5f52706..deb040a 100644 --- a/R/deliver_focus_de.R +++ b/R/deliver_focus_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.focus_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_fr_de.R b/R/deliver_fr_de.R index 148d807..84ed5ac 100644 --- a/R/deliver_fr_de.R +++ b/R/deliver_fr_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.fr_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_freiepresse_de.R b/R/deliver_freiepresse_de.R index fd6174d..0419ff8 100644 --- a/R/deliver_freiepresse_de.R +++ b/R/deliver_freiepresse_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.freiepresse_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_txt <- json_txt[grepl("NewsArticle", json_txt)] diff --git a/R/deliver_haz_de.R b/R/deliver_haz_de.R index 5eade39..e2a65ec 100644 --- a/R/deliver_haz_de.R +++ b/R/deliver_haz_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.haz_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_txt <- json_txt[grepl("NewsArticle", json_txt)] diff --git a/R/deliver_heidelberg24_de.R b/R/deliver_heidelberg24_de.R index fb1222f..7a8f006 100644 --- a/R/deliver_heidelberg24_de.R +++ b/R/deliver_heidelberg24_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.heidelberg24_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_hna_de.R b/R/deliver_hna_de.R index bd3b1c8..70363fb 100644 --- a/R/deliver_hna_de.R +++ b/R/deliver_hna_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.hna_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_infranken_de.R b/R/deliver_infranken_de.R index e63416c..616be23 100644 --- a/R/deliver_infranken_de.R +++ b/R/deliver_infranken_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.infranken_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_jungefreiheit_de.R b/R/deliver_jungefreiheit_de.R index 8731873..3ec1fb5 100644 --- a/R/deliver_jungefreiheit_de.R +++ b/R/deliver_jungefreiheit_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.jungefreiheit_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_kabeleins_de.R b/R/deliver_kabeleins_de.R index f176536..f69af62 100644 --- a/R/deliver_kabeleins_de.R +++ b/R/deliver_kabeleins_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.kabeleins_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) diff --git a/R/deliver_karlsruhe_insider_de.R b/R/deliver_karlsruhe_insider_de.R index 57a0aee..f83d63b 100644 --- a/R/deliver_karlsruhe_insider_de.R +++ b/R/deliver_karlsruhe_insider_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.karlsruhe_insider_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_kreiszeitung_de.R b/R/deliver_kreiszeitung_de.R index d9595b2..12f8e11 100644 --- a/R/deliver_kreiszeitung_de.R +++ b/R/deliver_kreiszeitung_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.kreiszeitung_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_ksta_de.R b/R/deliver_ksta_de.R index b55757f..273b4b7 100644 --- a/R/deliver_ksta_de.R +++ b/R/deliver_ksta_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.ksta_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_kurier_at.R b/R/deliver_kurier_at.R index ab3bc9d..91814de 100644 --- a/R/deliver_kurier_at.R +++ b/R/deliver_kurier_at.R @@ -5,7 +5,7 @@ pb_deliver_paper.kurier_at <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_lvz_de.R b/R/deliver_lvz_de.R index 7c45d14..4e651ed 100644 --- a/R/deliver_lvz_de.R +++ b/R/deliver_lvz_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.lvz_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[3]) diff --git a/R/deliver_manager_magazin_de.R b/R/deliver_manager_magazin_de.R index 74ecd51..2aa8a51 100644 --- a/R/deliver_manager_magazin_de.R +++ b/R/deliver_manager_magazin_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.manager_magazin_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_maz_online_de.R b/R/deliver_maz_online_de.R index 75c73e1..3a16ca9 100644 --- a/R/deliver_maz_online_de.R +++ b/R/deliver_maz_online_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.maz_online_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[3]) diff --git a/R/deliver_mdr_de.R b/R/deliver_mdr_de.R index 6854d56..2e9b048 100644 --- a/R/deliver_mdr_de.R +++ b/R/deliver_mdr_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.mdr_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_merkur_de.R b/R/deliver_merkur_de.R index 5ca6c91..f048a1b 100644 --- a/R/deliver_merkur_de.R +++ b/R/deliver_merkur_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.merkur_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_mopo_de.R b/R/deliver_mopo_de.R index 6cae71d..46ca9f9 100644 --- a/R/deliver_mopo_de.R +++ b/R/deliver_mopo_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.mopo_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_morgenpost_de.R b/R/deliver_morgenpost_de.R index 67d1aa6..52063e7 100644 --- a/R/deliver_morgenpost_de.R +++ b/R/deliver_morgenpost_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.morgenpost_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) diff --git a/R/deliver_n-tv_de.R b/R/deliver_n-tv_de.R index 58b0eed..0bca141 100644 --- a/R/deliver_n-tv_de.R +++ b/R/deliver_n-tv_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.n_tv_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_ndr_de.R b/R/deliver_ndr_de.R index 910d083..dad07f3 100644 --- a/R/deliver_ndr_de.R +++ b/R/deliver_ndr_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.ndr_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_news_de.R b/R/deliver_news_de.R index 0ca9cca..83f51a5 100644 --- a/R/deliver_news_de.R +++ b/R/deliver_news_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.news_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_news_und_nachrichten_de.R b/R/deliver_news_und_nachrichten_de.R index 8b2a7f2..5c75beb 100644 --- a/R/deliver_news_und_nachrichten_de.R +++ b/R/deliver_news_und_nachrichten_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.news_und_nachrichten_de <- function(x, verbose = NULL, pb, ...) 
html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(gsub("[\r\n]*", "", json_txt[1])) diff --git a/R/deliver_newsflash24_de.R b/R/deliver_newsflash24_de.R index 95f41f8..e26463c 100644 --- a/R/deliver_newsflash24_de.R +++ b/R/deliver_newsflash24_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.newsflash24_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_nordkurier_de.R b/R/deliver_nordkurier_de.R index 709f4d5..bc1900d 100644 --- a/R/deliver_nordkurier_de.R +++ b/R/deliver_nordkurier_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.nordkurier_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) diff --git a/R/deliver_noz_de.R b/R/deliver_noz_de.R index 02453de..5412ac1 100644 --- a/R/deliver_noz_de.R +++ b/R/deliver_noz_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.noz_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_nw_de.R b/R/deliver_nw_de.R index a0f9c58..4937e7d 100644 --- a/R/deliver_nw_de.R +++ b/R/deliver_nw_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.nw_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) diff --git a/R/deliver_nzz_ch.R b/R/deliver_nzz_ch.R index 278382e..d2fe908 100644 --- a/R/deliver_nzz_ch.R +++ b/R/deliver_nzz_ch.R @@ -5,7 +5,7 @@ pb_deliver_paper.nzz_ch <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) diff --git a/R/deliver_orf_at.R b/R/deliver_orf_at.R index fa14ae9..041eeb0 100644 --- a/R/deliver_orf_at.R +++ b/R/deliver_orf_at.R @@ -5,7 +5,7 @@ pb_deliver_paper.orf_at <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_ostsee_zeitung_de.R b/R/deliver_ostsee_zeitung_de.R index bf9a88c..934a69c 100644 --- a/R/deliver_ostsee_zeitung_de.R +++ b/R/deliver_ostsee_zeitung_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.ostsee_zeitung_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_txt <- json_txt[grepl("NewsArticle", json_txt)] diff --git a/R/deliver_presseportal_de.R b/R/deliver_presseportal_de.R index b18aa77..1db8d9f 100644 --- a/R/deliver_presseportal_de.R +++ b/R/deliver_presseportal_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.presseportal_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) diff --git a/R/deliver_prosieben_de.R b/R/deliver_prosieben_de.R index 2b0e1c8..c3f1394 100644 --- a/R/deliver_prosieben_de.R +++ b/R/deliver_prosieben_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.prosieben_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { if (length(json_txt) == 2) { diff --git a/R/deliver_rnd_de.R b/R/deliver_rnd_de.R index 915f98d..5b03588 100644 --- a/R/deliver_rnd_de.R +++ b/R/deliver_rnd_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.rnd_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[3]) diff --git a/R/deliver_rollingstone_de.R b/R/deliver_rollingstone_de.R index 6e0462a..ad70856 100644 --- a/R/deliver_rollingstone_de.R +++ b/R/deliver_rollingstone_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.rollingstone_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_rp_online_de.R b/R/deliver_rp_online_de.R index 0064aba..471f186 100644 --- a/R/deliver_rp_online_de.R +++ b/R/deliver_rp_online_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.rp_online_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_rtl_de.R b/R/deliver_rtl_de.R index 75ea01a..bf836ac 100644 --- a/R/deliver_rtl_de.R +++ b/R/deliver_rtl_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.rtl_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) diff --git a/R/deliver_ruhr24_de.R b/R/deliver_ruhr24_de.R index 0a861ef..684b6d5 100644 --- a/R/deliver_ruhr24_de.R +++ b/R/deliver_ruhr24_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.ruhr24_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_ruhrnachrichten_de.R b/R/deliver_ruhrnachrichten_de.R index 2bb36ec..f1e9637 100644 --- a/R/deliver_ruhrnachrichten_de.R +++ b/R/deliver_ruhrnachrichten_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.ruhrnachrichten_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_saechsische_de.R b/R/deliver_saechsische_de.R index cd84973..d389c3a 100644 --- a/R/deliver_saechsische_de.R +++ b/R/deliver_saechsische_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.saechsische_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[3]) diff --git a/R/deliver_shz_de.R b/R/deliver_shz_de.R index acebf28..d6a6d78 100644 --- a/R/deliver_shz_de.R +++ b/R/deliver_shz_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.shz_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_stern_de.R b/R/deliver_stern_de.R index 4a64f3e..0c9eb78 100644 --- a/R/deliver_stern_de.R +++ b/R/deliver_stern_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.stern_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt)[1, ] diff --git a/R/deliver_stuttgarter_zeitung_de.R b/R/deliver_stuttgarter_zeitung_de.R index 140de47..5d68731 100644 --- a/R/deliver_stuttgarter_zeitung_de.R +++ b/R/deliver_stuttgarter_zeitung_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.stuttgarter_zeitung_de <- function(x, verbose = NULL, pb, ...) html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_sueddeutsche_de.R b/R/deliver_sueddeutsche_de.R index ad9a6e1..f621d62 100644 --- a/R/deliver_sueddeutsche_de.R +++ b/R/deliver_sueddeutsche_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.sueddeutsche_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_suedkurier_de.R b/R/deliver_suedkurier_de.R index 640d1aa..e4de4f4 100644 --- a/R/deliver_suedkurier_de.R +++ b/R/deliver_suedkurier_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.suedkurier_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_swp_de.R b/R/deliver_swp_de.R index 49080f7..c65d331 100644 --- a/R/deliver_swp_de.R +++ b/R/deliver_swp_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.swp_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) diff --git a/R/deliver_t3n_de.R b/R/deliver_t3n_de.R index 22db194..fa5c06e 100644 --- a/R/deliver_t3n_de.R +++ b/R/deliver_t3n_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.t3n_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_t_online_de.R b/R/deliver_t_online_de.R index dd868a4..e6598f4 100644 --- a/R/deliver_t_online_de.R +++ b/R/deliver_t_online_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.t_online_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_tag24_de.R b/R/deliver_tag24_de.R index 5de25f2..367c109 100644 --- a/R/deliver_tag24_de.R +++ b/R/deliver_tag24_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.tag24_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_tagesschau_de.R b/R/deliver_tagesschau_de.R index 2d76f70..7c2f48d 100644 --- a/R/deliver_tagesschau_de.R +++ b/R/deliver_tagesschau_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.tagesschau_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_tagesspiegel_de.R b/R/deliver_tagesspiegel_de.R index d9ed5fc..4b29277 100644 --- a/R/deliver_tagesspiegel_de.R +++ b/R/deliver_tagesspiegel_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.tagesspiegel_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_thueringer_allgemeine_de.R b/R/deliver_thueringer_allgemeine_de.R index 9033cb0..cfa9e2c 100644 --- a/R/deliver_thueringer_allgemeine_de.R +++ b/R/deliver_thueringer_allgemeine_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.thueringer_allgemeine_de <- function(x, verbose = NULL, pb, ... html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) diff --git a/R/deliver_tz_de.R b/R/deliver_tz_de.R index 55129ae..dc15107 100644 --- a/R/deliver_tz_de.R +++ b/R/deliver_tz_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.tz_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_vice_com.R b/R/deliver_vice_com.R index f7ef3aa..6945c9f 100644 --- a/R/deliver_vice_com.R +++ b/R/deliver_vice_com.R @@ -5,7 +5,7 @@ pb_deliver_paper.vice_com <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) diff --git a/R/deliver_volksstimme_de.R b/R/deliver_volksstimme_de.R index 5c1d73a..1f1491d 100644 --- a/R/deliver_volksstimme_de.R +++ b/R/deliver_volksstimme_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.volksstimme_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_vox_de.R b/R/deliver_vox_de.R index 9f9b0ad..1b71995 100644 --- a/R/deliver_vox_de.R +++ b/R/deliver_vox_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.vox_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_wa_de.R b/R/deliver_wa_de.R index 89c64d2..865008b 100644 --- a/R/deliver_wa_de.R +++ b/R/deliver_wa_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.wa_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_watson_ch.R b/R/deliver_watson_ch.R index f09473b..81b882a 100644 --- a/R/deliver_watson_ch.R +++ b/R/deliver_watson_ch.R @@ -5,7 +5,7 @@ pb_deliver_paper.watson_ch <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) diff --git a/R/deliver_watson_de.R b/R/deliver_watson_de.R index c0b62ed..de4da25 100644 --- a/R/deliver_watson_de.R +++ b/R/deliver_watson_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.watson_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) diff --git a/R/deliver_waz_de.R b/R/deliver_waz_de.R index 94604b4..8ae5f1a 100644 --- a/R/deliver_waz_de.R +++ b/R/deliver_waz_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.waz_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) diff --git a/R/deliver_wdr_de.R b/R/deliver_wdr_de.R index 26a4c13..be383a0 100644 --- a/R/deliver_wdr_de.R +++ b/R/deliver_wdr_de.R @@ -6,7 +6,7 @@ pb_deliver_paper.wdr_de <- function(x, verbose = NULL, pb, ...) { # careful: json can have many objects but the first seems to be the article json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_welt_de.R b/R/deliver_welt_de.R index 6ab45b7..ab03a37 100644 --- a/R/deliver_welt_de.R +++ b/R/deliver_welt_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.welt_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) diff --git a/R/deliver_wiwo_de.R b/R/deliver_wiwo_de.R index 94f2976..f182a17 100644 --- a/R/deliver_wiwo_de.R +++ b/R/deliver_wiwo_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.wiwo_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { if (length(json_txt) != 0) { # otherwise the article is paywalled and not scrapeable diff --git a/R/deliver_zdf_de.R b/R/deliver_zdf_de.R index 317dc71..7107ace 100644 --- a/R/deliver_zdf_de.R +++ b/R/deliver_zdf_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.zdf_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt))) { + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { if (length(json_txt) != 1) { From b7d142b217e0a4ce913fc83fd0a342773557cc66 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Mon, 21 Oct 2024 07:06:00 +0200 Subject: [PATCH 107/121] rm json dumping --- R/deliver_abendblatt_de.R | 3 +-- R/deliver_abendzeitung_muenchen_de.R | 3 +-- R/deliver_badische_zeitung_de.R | 3 +-- R/deliver_berliner_kurier_de.R | 3 +-- R/deliver_berliner_zeitung_de.R | 3 +-- R/deliver_bnn_de.R | 3 +-- R/deliver_br_de.R | 3 +-- R/deliver_businessinsider_de.R | 3 +-- R/deliver_der_postillon_com.R | 3 +-- R/deliver_derwesten_de.R | 3 +-- R/deliver_echo24_de.R | 3 +-- R/deliver_epochtimes_de.R | 3 +-- R/deliver_express_de.R | 3 +-- R/deliver_finanzen_net.R | 3 +-- R/deliver_fnp_de.R | 3 +-- R/deliver_focus_de.R | 3 +-- R/deliver_fr_de.R | 3 +-- R/deliver_freiepresse_de.R | 3 +-- R/deliver_handelsblatt_de.R | 3 +-- R/deliver_haz_de.R | 3 +-- R/deliver_heidelberg24_de.R | 3 +-- R/deliver_hna_de.R | 3 +-- R/deliver_infranken_de.R | 3 +-- R/deliver_jungefreiheit_de.R | 3 +-- R/deliver_kabeleins_de.R | 3 +-- R/deliver_karlsruhe_insider_de.R | 3 +-- R/deliver_kreiszeitung_de.R | 3 +-- R/deliver_ksta_de.R | 3 +-- 
R/deliver_kurier_at.R | 3 +-- R/deliver_lvz_de.R | 3 +-- R/deliver_manager_magazin_de.R | 3 +-- R/deliver_maz_online_de.R | 3 +-- R/deliver_mdr_de.R | 3 +-- R/deliver_merkur_de.R | 3 +-- R/deliver_mopo_de.R | 3 +-- R/deliver_morgenpost_de.R | 3 +-- R/deliver_n-tv_de.R | 3 +-- R/deliver_ndr_de.R | 3 +-- R/deliver_news_de.R | 3 +-- R/deliver_news_und_nachrichten_de.R | 3 +-- R/deliver_newsflash24_de.R | 3 +-- R/deliver_nordkurier_de.R | 3 +-- R/deliver_noz_de.R | 3 +-- R/deliver_nw_de.R | 3 +-- R/deliver_nzz_ch.R | 3 +-- R/deliver_orf_at.R | 3 +-- R/deliver_ostsee_zeitung_de.R | 3 +-- R/deliver_presseportal_de.R | 3 +-- R/deliver_prosieben_de.R | 3 +-- R/deliver_rnd_de.R | 3 +-- R/deliver_rollingstone_de.R | 3 +-- R/deliver_rp_online_de.R | 3 +-- R/deliver_rtl_de.R | 3 +-- R/deliver_ruhr24_de.R | 3 +-- R/deliver_ruhrnachrichten_de.R | 3 +-- R/deliver_saechsische_de.R | 3 +-- R/deliver_shz_de.R | 3 +-- R/deliver_stern_de.R | 3 +-- R/deliver_stuttgarter_zeitung_de.R | 3 +-- R/deliver_sueddeutsche_de.R | 3 +-- R/deliver_suedkurier_de.R | 3 +-- R/deliver_swp_de.R | 3 +-- R/deliver_t3n_de.R | 3 +-- R/deliver_t_online_de.R | 3 +-- R/deliver_tag24_de.R | 3 +-- R/deliver_tagesschau_de.R | 3 +-- R/deliver_tagesspiegel_de.R | 3 +-- R/deliver_thueringer_allgemeine_de.R | 3 +-- R/deliver_tz_de.R | 3 +-- R/deliver_vice_com.R | 3 +-- R/deliver_volksstimme_de.R | 3 +-- R/deliver_vox_de.R | 3 +-- R/deliver_wa_de.R | 3 +-- R/deliver_watson_ch.R | 3 +-- R/deliver_watson_de.R | 3 +-- R/deliver_waz_de.R | 3 +-- R/deliver_wdr_de.R | 3 +-- R/deliver_welt_de.R | 3 +-- R/deliver_zdf_de.R | 3 +-- 79 files changed, 79 insertions(+), 158 deletions(-) diff --git a/R/deliver_abendblatt_de.R b/R/deliver_abendblatt_de.R index c505090..9ff3c1f 100644 --- a/R/deliver_abendblatt_de.R +++ b/R/deliver_abendblatt_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.abendblatt_de <- function(x, verbose = NULL, pb, ...) 
{ datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_abendzeitung_muenchen_de.R b/R/deliver_abendzeitung_muenchen_de.R index c833491..d75a8e9 100644 --- a/R/deliver_abendzeitung_muenchen_de.R +++ b/R/deliver_abendzeitung_muenchen_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.abendzeitung_muenchen_de <- function(x, verbose = NULL, pb, ... datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_badische_zeitung_de.R b/R/deliver_badische_zeitung_de.R index 1ba5911..3e49034 100644 --- a/R/deliver_badische_zeitung_de.R +++ b/R/deliver_badische_zeitung_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.badische_zeitung_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_berliner_kurier_de.R b/R/deliver_berliner_kurier_de.R index d840e75..6299738 100644 --- a/R/deliver_berliner_kurier_de.R +++ b/R/deliver_berliner_kurier_de.R @@ -19,7 +19,6 @@ pb_deliver_paper.berliner_kurier_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } diff --git a/R/deliver_berliner_zeitung_de.R b/R/deliver_berliner_zeitung_de.R index ee0a32b..27eb9d3 100644 --- a/R/deliver_berliner_zeitung_de.R +++ b/R/deliver_berliner_zeitung_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.berliner_zeitung_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_bnn_de.R b/R/deliver_bnn_de.R index 1e8a40e..51f78d9 100644 --- a/R/deliver_bnn_de.R +++ b/R/deliver_bnn_de.R @@ -26,8 +26,7 @@ pb_deliver_paper.bnn_de <- function(x, verbose = NULL, pb, ...) 
{ datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_br_de.R b/R/deliver_br_de.R index 7b4a8a4..ea92e0b 100644 --- a/R/deliver_br_de.R +++ b/R/deliver_br_de.R @@ -40,8 +40,7 @@ pb_deliver_paper.br_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_businessinsider_de.R b/R/deliver_businessinsider_de.R index ff2eba2..df358d7 100644 --- a/R/deliver_businessinsider_de.R +++ b/R/deliver_businessinsider_de.R @@ -28,8 +28,7 @@ pb_deliver_paper.businessinsider_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_der_postillon_com.R b/R/deliver_der_postillon_com.R index 5da42d2..0ce69f8 100644 --- a/R/deliver_der_postillon_com.R +++ b/R/deliver_der_postillon_com.R @@ -32,8 +32,7 @@ pb_deliver_paper.der_postillon_com <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_derwesten_de.R b/R/deliver_derwesten_de.R index 7b3a10e..017bec4 100644 --- a/R/deliver_derwesten_de.R +++ b/R/deliver_derwesten_de.R @@ -26,8 +26,7 @@ pb_deliver_paper.derwesten_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_echo24_de.R b/R/deliver_echo24_de.R index 7e4c74d..7503ccf 100644 --- a/R/deliver_echo24_de.R +++ b/R/deliver_echo24_de.R @@ -23,8 +23,7 @@ pb_deliver_paper.echo24_de <- function(x, verbose = NULL, pb, ...) 
{ datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_epochtimes_de.R b/R/deliver_epochtimes_de.R index ed92023..a374a5b 100644 --- a/R/deliver_epochtimes_de.R +++ b/R/deliver_epochtimes_de.R @@ -19,8 +19,7 @@ pb_deliver_paper.epochtimes_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_express_de.R b/R/deliver_express_de.R index c1ebe6c..21901ee 100644 --- a/R/deliver_express_de.R +++ b/R/deliver_express_de.R @@ -30,8 +30,7 @@ pb_deliver_paper.express_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_finanzen_net.R b/R/deliver_finanzen_net.R index 9515835..ee78407 100644 --- a/R/deliver_finanzen_net.R +++ b/R/deliver_finanzen_net.R @@ -22,8 +22,7 @@ pb_deliver_paper.finanzen_net <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_fnp_de.R b/R/deliver_fnp_de.R index b998439..a6c514f 100644 --- a/R/deliver_fnp_de.R +++ b/R/deliver_fnp_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.fnp_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_focus_de.R b/R/deliver_focus_de.R index deb040a..41cbf96 100644 --- a/R/deliver_focus_de.R +++ b/R/deliver_focus_de.R @@ -23,8 +23,7 @@ pb_deliver_paper.focus_de <- function(x, verbose = NULL, pb, ...) 
{ datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_fr_de.R b/R/deliver_fr_de.R index 84ed5ac..ee9031e 100644 --- a/R/deliver_fr_de.R +++ b/R/deliver_fr_de.R @@ -24,8 +24,7 @@ pb_deliver_paper.fr_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_freiepresse_de.R b/R/deliver_freiepresse_de.R index 0419ff8..0e0cf89 100644 --- a/R/deliver_freiepresse_de.R +++ b/R/deliver_freiepresse_de.R @@ -26,8 +26,7 @@ pb_deliver_paper.freiepresse_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_handelsblatt_de.R b/R/deliver_handelsblatt_de.R index af2b67b..baa086d 100644 --- a/R/deliver_handelsblatt_de.R +++ b/R/deliver_handelsblatt_de.R @@ -21,7 +21,6 @@ pb_deliver_paper.handelsblatt_com <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } diff --git a/R/deliver_haz_de.R b/R/deliver_haz_de.R index e2a65ec..d3dfb95 100644 --- a/R/deliver_haz_de.R +++ b/R/deliver_haz_de.R @@ -26,8 +26,7 @@ pb_deliver_paper.haz_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_heidelberg24_de.R b/R/deliver_heidelberg24_de.R index 7a8f006..dedc0f2 100644 --- a/R/deliver_heidelberg24_de.R +++ b/R/deliver_heidelberg24_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.heidelberg24_de <- function(x, verbose = NULL, pb, ...) 
{ datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_hna_de.R b/R/deliver_hna_de.R index 70363fb..556bc45 100644 --- a/R/deliver_hna_de.R +++ b/R/deliver_hna_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.hna_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_infranken_de.R b/R/deliver_infranken_de.R index 616be23..ef30165 100644 --- a/R/deliver_infranken_de.R +++ b/R/deliver_infranken_de.R @@ -19,8 +19,7 @@ pb_deliver_paper.infranken_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_jungefreiheit_de.R b/R/deliver_jungefreiheit_de.R index 3ec1fb5..c13aa5b 100644 --- a/R/deliver_jungefreiheit_de.R +++ b/R/deliver_jungefreiheit_de.R @@ -27,8 +27,7 @@ pb_deliver_paper.jungefreiheit_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_kabeleins_de.R b/R/deliver_kabeleins_de.R index f69af62..ca22970 100644 --- a/R/deliver_kabeleins_de.R +++ b/R/deliver_kabeleins_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.kabeleins_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_karlsruhe_insider_de.R b/R/deliver_karlsruhe_insider_de.R index f83d63b..059b1b7 100644 --- a/R/deliver_karlsruhe_insider_de.R +++ b/R/deliver_karlsruhe_insider_de.R @@ -28,8 +28,7 @@ pb_deliver_paper.karlsruhe_insider_de <- function(x, verbose = NULL, pb, ...) 
{ datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_kreiszeitung_de.R b/R/deliver_kreiszeitung_de.R index 12f8e11..419ff29 100644 --- a/R/deliver_kreiszeitung_de.R +++ b/R/deliver_kreiszeitung_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.kreiszeitung_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_ksta_de.R b/R/deliver_ksta_de.R index 273b4b7..165e7a4 100644 --- a/R/deliver_ksta_de.R +++ b/R/deliver_ksta_de.R @@ -30,8 +30,7 @@ pb_deliver_paper.ksta_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_kurier_at.R b/R/deliver_kurier_at.R index 91814de..b904cc1 100644 --- a/R/deliver_kurier_at.R +++ b/R/deliver_kurier_at.R @@ -22,8 +22,7 @@ pb_deliver_paper.kurier_at <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_lvz_de.R b/R/deliver_lvz_de.R index 4e651ed..07d67d2 100644 --- a/R/deliver_lvz_de.R +++ b/R/deliver_lvz_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.lvz_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_manager_magazin_de.R b/R/deliver_manager_magazin_de.R index 2aa8a51..bfd3cae 100644 --- a/R/deliver_manager_magazin_de.R +++ b/R/deliver_manager_magazin_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.manager_magazin_de <- function(x, verbose = NULL, pb, ...) 
{ datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_maz_online_de.R b/R/deliver_maz_online_de.R index 3a16ca9..8c909cb 100644 --- a/R/deliver_maz_online_de.R +++ b/R/deliver_maz_online_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.maz_online_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_mdr_de.R b/R/deliver_mdr_de.R index 2e9b048..20366cc 100644 --- a/R/deliver_mdr_de.R +++ b/R/deliver_mdr_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.mdr_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_merkur_de.R b/R/deliver_merkur_de.R index f048a1b..09700c9 100644 --- a/R/deliver_merkur_de.R +++ b/R/deliver_merkur_de.R @@ -23,8 +23,7 @@ pb_deliver_paper.merkur_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_mopo_de.R b/R/deliver_mopo_de.R index 46ca9f9..3254b4a 100644 --- a/R/deliver_mopo_de.R +++ b/R/deliver_mopo_de.R @@ -30,8 +30,7 @@ pb_deliver_paper.mopo_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_morgenpost_de.R b/R/deliver_morgenpost_de.R index 52063e7..461d210 100644 --- a/R/deliver_morgenpost_de.R +++ b/R/deliver_morgenpost_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.morgenpost_de <- function(x, verbose = NULL, pb, ...) 
{ datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_n-tv_de.R b/R/deliver_n-tv_de.R index 0bca141..213a624 100644 --- a/R/deliver_n-tv_de.R +++ b/R/deliver_n-tv_de.R @@ -23,8 +23,7 @@ pb_deliver_paper.n_tv_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_ndr_de.R b/R/deliver_ndr_de.R index dad07f3..96a61c9 100644 --- a/R/deliver_ndr_de.R +++ b/R/deliver_ndr_de.R @@ -27,8 +27,7 @@ pb_deliver_paper.ndr_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_news_de.R b/R/deliver_news_de.R index 83f51a5..4d02dcf 100644 --- a/R/deliver_news_de.R +++ b/R/deliver_news_de.R @@ -20,8 +20,7 @@ pb_deliver_paper.news_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_news_und_nachrichten_de.R b/R/deliver_news_und_nachrichten_de.R index 5c75beb..c7d7276 100644 --- a/R/deliver_news_und_nachrichten_de.R +++ b/R/deliver_news_und_nachrichten_de.R @@ -19,8 +19,7 @@ pb_deliver_paper.news_und_nachrichten_de <- function(x, verbose = NULL, pb, ...) datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_newsflash24_de.R b/R/deliver_newsflash24_de.R index e26463c..71e5b98 100644 --- a/R/deliver_newsflash24_de.R +++ b/R/deliver_newsflash24_de.R @@ -27,8 +27,7 @@ pb_deliver_paper.newsflash24_de <- function(x, verbose = NULL, pb, ...) 
{ datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_nordkurier_de.R b/R/deliver_nordkurier_de.R index bc1900d..51fc9a6 100644 --- a/R/deliver_nordkurier_de.R +++ b/R/deliver_nordkurier_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.nordkurier_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_noz_de.R b/R/deliver_noz_de.R index 5412ac1..422d01d 100644 --- a/R/deliver_noz_de.R +++ b/R/deliver_noz_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.noz_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_nw_de.R b/R/deliver_nw_de.R index 4937e7d..0dfcf0e 100644 --- a/R/deliver_nw_de.R +++ b/R/deliver_nw_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.nw_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_nzz_ch.R b/R/deliver_nzz_ch.R index d2fe908..8b2f148 100644 --- a/R/deliver_nzz_ch.R +++ b/R/deliver_nzz_ch.R @@ -22,8 +22,7 @@ pb_deliver_paper.nzz_ch <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_orf_at.R b/R/deliver_orf_at.R index 041eeb0..90b3f5e 100644 --- a/R/deliver_orf_at.R +++ b/R/deliver_orf_at.R @@ -22,8 +22,7 @@ pb_deliver_paper.orf_at <- function(x, verbose = NULL, pb, ...) 
{ datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_ostsee_zeitung_de.R b/R/deliver_ostsee_zeitung_de.R index 934a69c..06a9ee3 100644 --- a/R/deliver_ostsee_zeitung_de.R +++ b/R/deliver_ostsee_zeitung_de.R @@ -26,8 +26,7 @@ pb_deliver_paper.ostsee_zeitung_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_presseportal_de.R b/R/deliver_presseportal_de.R index 1db8d9f..7d177f7 100644 --- a/R/deliver_presseportal_de.R +++ b/R/deliver_presseportal_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.presseportal_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_prosieben_de.R b/R/deliver_prosieben_de.R index c3f1394..8261317 100644 --- a/R/deliver_prosieben_de.R +++ b/R/deliver_prosieben_de.R @@ -33,8 +33,7 @@ pb_deliver_paper.prosieben_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_rnd_de.R b/R/deliver_rnd_de.R index 5b03588..ceea221 100644 --- a/R/deliver_rnd_de.R +++ b/R/deliver_rnd_de.R @@ -26,8 +26,7 @@ pb_deliver_paper.rnd_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_rollingstone_de.R b/R/deliver_rollingstone_de.R index ad70856..4e4b0ff 100644 --- a/R/deliver_rollingstone_de.R +++ b/R/deliver_rollingstone_de.R @@ -27,8 +27,7 @@ pb_deliver_paper.rollingstone_de <- function(x, verbose = NULL, pb, ...) 
{ datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_rp_online_de.R b/R/deliver_rp_online_de.R index 471f186..3c9bce3 100644 --- a/R/deliver_rp_online_de.R +++ b/R/deliver_rp_online_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.rp_online_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_rtl_de.R b/R/deliver_rtl_de.R index bf836ac..0c92eb3 100644 --- a/R/deliver_rtl_de.R +++ b/R/deliver_rtl_de.R @@ -28,8 +28,7 @@ pb_deliver_paper.rtl_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_ruhr24_de.R b/R/deliver_ruhr24_de.R index 684b6d5..20ccd3a 100644 --- a/R/deliver_ruhr24_de.R +++ b/R/deliver_ruhr24_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.ruhr24_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_ruhrnachrichten_de.R b/R/deliver_ruhrnachrichten_de.R index f1e9637..3486f80 100644 --- a/R/deliver_ruhrnachrichten_de.R +++ b/R/deliver_ruhrnachrichten_de.R @@ -28,8 +28,7 @@ pb_deliver_paper.ruhrnachrichten_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_saechsische_de.R b/R/deliver_saechsische_de.R index d389c3a..3f9fde7 100644 --- a/R/deliver_saechsische_de.R +++ b/R/deliver_saechsische_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.saechsische_de <- function(x, verbose = NULL, pb, ...) 
{ datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_shz_de.R b/R/deliver_shz_de.R index d6a6d78..ddbe9e8 100644 --- a/R/deliver_shz_de.R +++ b/R/deliver_shz_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.shz_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_stern_de.R b/R/deliver_stern_de.R index 0c9eb78..2fa5fe1 100644 --- a/R/deliver_stern_de.R +++ b/R/deliver_stern_de.R @@ -23,8 +23,7 @@ pb_deliver_paper.stern_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_stuttgarter_zeitung_de.R b/R/deliver_stuttgarter_zeitung_de.R index 5d68731..6d731fd 100644 --- a/R/deliver_stuttgarter_zeitung_de.R +++ b/R/deliver_stuttgarter_zeitung_de.R @@ -25,8 +25,7 @@ pb_deliver_paper.stuttgarter_zeitung_de <- function(x, verbose = NULL, pb, ...) datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_sueddeutsche_de.R b/R/deliver_sueddeutsche_de.R index f621d62..fad18da 100644 --- a/R/deliver_sueddeutsche_de.R +++ b/R/deliver_sueddeutsche_de.R @@ -19,8 +19,7 @@ pb_deliver_paper.sueddeutsche_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_suedkurier_de.R b/R/deliver_suedkurier_de.R index e4de4f4..0ecb6a8 100644 --- a/R/deliver_suedkurier_de.R +++ b/R/deliver_suedkurier_de.R @@ -26,8 +26,7 @@ pb_deliver_paper.suedkurier_de <- function(x, verbose = NULL, pb, ...) 
{ datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_swp_de.R b/R/deliver_swp_de.R index c65d331..6afa8d7 100644 --- a/R/deliver_swp_de.R +++ b/R/deliver_swp_de.R @@ -21,8 +21,7 @@ pb_deliver_paper.swp_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_t3n_de.R b/R/deliver_t3n_de.R index fa5c06e..98c3e5b 100644 --- a/R/deliver_t3n_de.R +++ b/R/deliver_t3n_de.R @@ -20,8 +20,7 @@ pb_deliver_paper.t3n_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_t_online_de.R b/R/deliver_t_online_de.R index e6598f4..13f0de3 100644 --- a/R/deliver_t_online_de.R +++ b/R/deliver_t_online_de.R @@ -23,8 +23,7 @@ pb_deliver_paper.t_online_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_tag24_de.R b/R/deliver_tag24_de.R index 367c109..700dc72 100644 --- a/R/deliver_tag24_de.R +++ b/R/deliver_tag24_de.R @@ -18,8 +18,7 @@ pb_deliver_paper.tag24_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_tagesschau_de.R b/R/deliver_tagesschau_de.R index 7c2f48d..8e485e8 100644 --- a/R/deliver_tagesschau_de.R +++ b/R/deliver_tagesschau_de.R @@ -19,8 +19,7 @@ pb_deliver_paper.tagesschau_de <- function(x, verbose = NULL, pb, ...) 
{ datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_tagesspiegel_de.R b/R/deliver_tagesspiegel_de.R index 4b29277..802c278 100644 --- a/R/deliver_tagesspiegel_de.R +++ b/R/deliver_tagesspiegel_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.tagesspiegel_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_thueringer_allgemeine_de.R b/R/deliver_thueringer_allgemeine_de.R index cfa9e2c..94b41fa 100644 --- a/R/deliver_thueringer_allgemeine_de.R +++ b/R/deliver_thueringer_allgemeine_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.thueringer_allgemeine_de <- function(x, verbose = NULL, pb, ... datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_tz_de.R b/R/deliver_tz_de.R index dc15107..289421d 100644 --- a/R/deliver_tz_de.R +++ b/R/deliver_tz_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.tz_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_vice_com.R b/R/deliver_vice_com.R index 6945c9f..1d7cb56 100644 --- a/R/deliver_vice_com.R +++ b/R/deliver_vice_com.R @@ -22,8 +22,7 @@ pb_deliver_paper.vice_com <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_volksstimme_de.R b/R/deliver_volksstimme_de.R index 1f1491d..219cef1 100644 --- a/R/deliver_volksstimme_de.R +++ b/R/deliver_volksstimme_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.volksstimme_de <- function(x, verbose = NULL, pb, ...) 
{ datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_vox_de.R b/R/deliver_vox_de.R index 1b71995..59d391c 100644 --- a/R/deliver_vox_de.R +++ b/R/deliver_vox_de.R @@ -27,8 +27,7 @@ pb_deliver_paper.vox_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_wa_de.R b/R/deliver_wa_de.R index 865008b..b0cd136 100644 --- a/R/deliver_wa_de.R +++ b/R/deliver_wa_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.wa_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_watson_ch.R b/R/deliver_watson_ch.R index 81b882a..d866558 100644 --- a/R/deliver_watson_ch.R +++ b/R/deliver_watson_ch.R @@ -19,8 +19,7 @@ pb_deliver_paper.watson_ch <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_watson_de.R b/R/deliver_watson_de.R index de4da25..931f1a3 100644 --- a/R/deliver_watson_de.R +++ b/R/deliver_watson_de.R @@ -19,8 +19,7 @@ pb_deliver_paper.watson_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_waz_de.R b/R/deliver_waz_de.R index 8ae5f1a..3db7ec8 100644 --- a/R/deliver_waz_de.R +++ b/R/deliver_waz_de.R @@ -22,8 +22,7 @@ pb_deliver_paper.waz_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_wdr_de.R b/R/deliver_wdr_de.R index be383a0..60cc46a 100644 --- a/R/deliver_wdr_de.R +++ b/R/deliver_wdr_de.R @@ -24,8 +24,7 @@ pb_deliver_paper.wdr_de <- function(x, verbose = NULL, pb, ...) 
{ datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_welt_de.R b/R/deliver_welt_de.R index ab03a37..c037382 100644 --- a/R/deliver_welt_de.R +++ b/R/deliver_welt_de.R @@ -19,8 +19,7 @@ pb_deliver_paper.welt_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } diff --git a/R/deliver_zdf_de.R b/R/deliver_zdf_de.R index 7107ace..bb275d2 100644 --- a/R/deliver_zdf_de.R +++ b/R/deliver_zdf_de.R @@ -40,8 +40,7 @@ pb_deliver_paper.zdf_de <- function(x, verbose = NULL, pb, ...) { datetime, author, headline, - text, - json_df # dumping the whole json data of an article + text ) } } From d0b3e704efa5681589fcd0f1143319c39cea9c82 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Mon, 21 Oct 2024 10:55:35 +0200 Subject: [PATCH 108/121] better error handling (based on webtrack data) --- R/deliver_bild_de.R | 63 ++++++++++++++++++------------------- R/deliver_br_de.R | 61 ++++++++++++++--------------------- R/deliver_handelsblatt_de.R | 44 +++++++++++++++++++++----- R/deliver_nw_de.R | 2 +- R/deliver_presseportal_de.R | 4 +-- R/deliver_prosieben_de.R | 8 ++--- R/deliver_rnd_de.R | 3 ++ R/deliver_saechsische_de.R | 3 ++ R/deliver_waz_de.R | 3 ++ R/deliver_wiwo_de.R | 13 +++----- 10 files changed, 110 insertions(+), 94 deletions(-) diff --git a/R/deliver_bild_de.R b/R/deliver_bild_de.R index 1f5b68f..b8df76b 100644 --- a/R/deliver_bild_de.R +++ b/R/deliver_bild_de.R @@ -1,37 +1,36 @@ #' @export pb_deliver_paper.bild_de <- function(x, verbose = NULL, pb, ...) 
{ - pb_tick(x, verbose, pb) - # raw html is stored in column content_raw - html <- rvest::read_html(x$content_raw) + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) - datetime <- html %>% - rvest::html_nodes("time.datetime, time.datetime--video datetime") %>% - rvest::html_text() %>% - lubridate::as_datetime(format = "%d.%m.%Y - %H:%M Uhr ") - - # headline - headline <- html %>% - rvest::html_nodes(".document-title__headline") %>% - rvest::html_text() - - # author - author <- html %>% - rvest::html_nodes(".authors") %>% - rvest::html_text() %>% - toString() - - # text - text <- html %>% - rvest::html_nodes(".article-body") %>% - rvest::html_text() %>% - paste(collapse = "\n") + datetime <- html %>% + rvest::html_node("time") %>% + rvest::html_attr("datetime") %>% + lubridate::as_datetime() - # the helper function safely creates a named list from objects - s_n_list( - datetime, - author, - headline, - text - ) + # headline + headline <- html %>% + rvest::html_nodes(".document-title__headline") %>% + rvest::html_text() + + # author + author <- html %>% + rvest::html_nodes(".article_author") %>% + rvest::html_text() %>% + toString() -} \ No newline at end of file + # text + text <- html %>% + rvest::html_nodes(".article-body") %>% + rvest::html_text() %>% + paste(collapse = "\n") + + # the helper function safely creates a named list from objects + s_n_list( + datetime, + author, + headline, + text + ) +} diff --git a/R/deliver_br_de.R b/R/deliver_br_de.R index ea92e0b..602049f 100644 --- a/R/deliver_br_de.R +++ b/R/deliver_br_de.R @@ -4,43 +4,28 @@ pb_deliver_paper.br_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { - return(s_n_list()) - } else { - json_df <- jsonlite::fromJSON(json_txt) - # if (is.null(names(json_df))) { + datetime <- html %>% + rvest::html_node("time") %>% + rvest::html_attr("datetime") %>% + lubridate::as_datetime() - if (any(json_df$`@type` == "NewsArticle")) { - json_df <- json_df[json_df$`@type` == "NewsArticle", ] - } else if (any(json_df$`@type` == "VideoObject")) { - json_df <- json_df[json_df$`@type` == "VideoObject", ] - } else if (any(json_df$`@type` == "AudioObject")) { - json_df <- json_df[json_df$`@type` == "AudioObject", ] - } else { - return(s_n_list()) - } - # } - if (json_df$`@type` != "VideoObject" && json_df$`@type` != "AudioObject") { # NewsArticle - datetime <- lubridate::as_datetime(json_df$datePublished) - headline <- json_df$headline - author <- toString(json_df$author$name) - text <- html %>% - rvest::html_node(".RichText_richText__wS9Rz.body3") %>% - rvest::html_nodes("p, h2") %>% - rvest::html_text2() %>% - paste(collapse = "\n") - } else { - datetime <- lubridate::as_datetime(json_df$uploadDate) - headline <- json_df$name - author <- "" - text <- json_df$description - } - s_n_list( - datetime, - author, - headline, - text - ) - } + headline <- html %>% + rvest::html_node(".heading1") %>% + rvest::html_text2() + + author <- html %>% + rvest::html_node(".ArticleModuleTeaser_authorName__Q7ctt") %>% + rvest::html_text2() %>% + toString() + text <- html %>% + rvest::html_node(".RichText_richText__wS9Rz.body3") %>% + rvest::html_nodes("p, h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + s_n_list( + datetime, + author, + headline, + text + ) } diff --git a/R/deliver_handelsblatt_de.R b/R/deliver_handelsblatt_de.R index baa086d..60a25e2 100644 --- a/R/deliver_handelsblatt_de.R 
+++ b/R/deliver_handelsblatt_de.R @@ -5,18 +5,46 @@ pb_deliver_paper.handelsblatt_com <- function(x, verbose = NULL, pb, ...) { # html <- rvest::read_html(x$content_raw) base_url <- "https://content.www.handelsblatt.com/api/content/eager/?url=" path <- adaR::ada_get_pathname(x$expanded_url) - json_df <- jsonlite::fromJSON(paste0(base_url, path)) + json_df <- tryCatch(jsonlite::fromJSON(paste0(base_url, path)), error = function(e) list(type = "404")) if (json_df$type == "redirect") { path <- json_df$location json_df <- jsonlite::fromJSON(paste0(base_url, path)) - } - datetime <- lubridate::as_datetime(json_df$header$dates$published) - headline <- json_df$header$headline - author <- toString(paste(json_df$authors$firstName, json_df$authors$lastName)) - text <- jsonlite::fromJSON(json_df$seo$jsonLd)$articleBody - text <- text[!is.na(text)] - text <- paste(text, collapse = "\n") + } else if (json_df$type == "404") { + html <- rvest::read_html(x$content_raw) + weekdays_de <- paste0(c("Montag", "Dienstag", "Mittwoch", "Donnerstag", "Freitag", "Samstag", "Sonntag"), collapse = "|") + months_de <- c("Januar", "Februar", "März", "April", "Mai", "Juni", "Juli", "August", "September", "Oktober", "November", "Dezember") + + date_string <- html %>% + rvest::html_node(".post-date .meta-text") |> + rvest::html_text2() + date_string <- gsub(weekdays_de, "", date_string) + + for (i in seq_along(months_de)) { + date_string <- gsub(months_de[i], i, date_string) + } + + date_string <- gsub("Uhr", "", date_string) + date_string <- gsub("‒", "", date_string) + date_string <- gsub(",", "", date_string) + + datetime <- lubridate::as_datetime(date_string, format = "%d. 
%m %Y %H:%M ") + headline <- html %>% + rvest::html_node("h1.entry-title") %>% + rvest::html_text() + author <- "" + text <- html %>% + rvest::html_nodes(".entry-content p, .entry-content h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + } else { + datetime <- lubridate::as_datetime(json_df$header$dates$published) + headline <- json_df$header$headline + author <- toString(paste(json_df$authors$firstName, json_df$authors$lastName)) + text <- jsonlite::fromJSON(json_df$seo$jsonLd)$articleBody + text <- text[!is.na(text)] + text <- paste(text, collapse = "\n") + } s_n_list( datetime, author, diff --git a/R/deliver_nw_de.R b/R/deliver_nw_de.R index 0dfcf0e..42d16a7 100644 --- a/R/deliver_nw_de.R +++ b/R/deliver_nw_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.nw_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { + if (isTRUE(is.na(json_txt)) || length(json_txt) <= 1) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) diff --git a/R/deliver_presseportal_de.R b/R/deliver_presseportal_de.R index 7d177f7..6e33ec2 100644 --- a/R/deliver_presseportal_de.R +++ b/R/deliver_presseportal_de.R @@ -4,8 +4,8 @@ pb_deliver_paper.presseportal_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() - if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text2() + if (isTRUE(is.na(json_txt)) || length(json_txt) <= 1) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) diff --git a/R/deliver_prosieben_de.R b/R/deliver_prosieben_de.R index 8261317..345468a 100644 --- a/R/deliver_prosieben_de.R +++ b/R/deliver_prosieben_de.R @@ -9,12 +9,10 @@ pb_deliver_paper.prosieben_de <- function(x, verbose = NULL, pb, ...) { return(s_n_list()) } else { if (length(json_txt) == 2) { - json_txt <- json_txt[2] %>% rvest::html_text() - } else { - json_txt <- json_txt %>% rvest::html_text() + json_txt <- json_txt[2] } json_df <- jsonlite::fromJSON(json_txt) - if (json_df$`@type` != "VideoObject") { # NewsArticle + if (json_df$`@type` != "VideoObject" && json_df$`@type` != "FAQPage") { # NewsArticle datetime <- lubridate::as_datetime(json_df$datePublished) headline <- json_df$headline author <- toString(json_df$author$name) @@ -22,6 +20,8 @@ pb_deliver_paper.prosieben_de <- function(x, verbose = NULL, pb, ...) { rvest::html_elements(".css-f9qfdi p.css-bq2685,.css-f9qfdi h2") %>% rvest::html_text2() %>% paste(collapse = "\n") + } else if (json_df$`@type` != "FAQPage") { + return(s_n_list()) } else { datetime <- lubridate::as_datetime(json_df$uploadDate) headline <- json_df$name diff --git a/R/deliver_rnd_de.R b/R/deliver_rnd_de.R index ceea221..88947e9 100644 --- a/R/deliver_rnd_de.R +++ b/R/deliver_rnd_de.R @@ -8,6 +8,9 @@ pb_deliver_paper.rnd_de <- function(x, verbose = NULL, pb, ...) 
{ if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { + if (length(json_txt) <= 2) { + return(s_n_list()) + } json_df <- jsonlite::fromJSON(json_txt[3]) datetime <- lubridate::as_datetime(json_df$datePublished) diff --git a/R/deliver_saechsische_de.R b/R/deliver_saechsische_de.R index 3f9fde7..31a32cb 100644 --- a/R/deliver_saechsische_de.R +++ b/R/deliver_saechsische_de.R @@ -8,6 +8,9 @@ pb_deliver_paper.saechsische_de <- function(x, verbose = NULL, pb, ...) { if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { + if (length(json_txt) >= 2) { + return(s_n_list()) + } json_df <- jsonlite::fromJSON(json_txt[3]) datetime <- lubridate::as_datetime(json_df$datePublished) diff --git a/R/deliver_waz_de.R b/R/deliver_waz_de.R index 3db7ec8..d6d590b 100644 --- a/R/deliver_waz_de.R +++ b/R/deliver_waz_de.R @@ -8,6 +8,9 @@ pb_deliver_paper.waz_de <- function(x, verbose = NULL, pb, ...) { if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { + if (length(json_txt) == 1) { + return(s_n_list()) + } json_df <- jsonlite::fromJSON(json_txt[2]) datetime <- lubridate::as_datetime(json_df$datePublished) diff --git a/R/deliver_wiwo_de.R b/R/deliver_wiwo_de.R index f182a17..52625f3 100644 --- a/R/deliver_wiwo_de.R +++ b/R/deliver_wiwo_de.R @@ -4,12 +4,12 @@ pb_deliver_paper.wiwo_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text() + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text2() if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { return(s_n_list()) } else { if (length(json_txt) != 0) { # otherwise the article is paywalled and not scrapeable - json_df <- jsonlite::fromJSON(json_txt) + json_df <- jsonlite::fromJSON(json_txt[1]) datetime <- lubridate::as_datetime(json_df$datePublished) headline <- json_df$headline @@ -20,18 +20,13 @@ pb_deliver_paper.wiwo_de <- function(x, verbose = NULL, pb, ...) { .[!grepl("Lesen Sie auch", .)] %>% # Remove links in between paste(collapse = "\n") } else { - datetime <- NA - headline <- NA - author <- NA - text <- NA - json_df <- list("no access") + return(s_n_list()) } s_n_list( datetime, author, headline, - text, - json_df + text ) } } From 3ec0c8e3f80701fc10a03d6056488991a4418b0d Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Mon, 21 Oct 2024 11:03:10 +0200 Subject: [PATCH 109/121] added taz.de --- NAMESPACE | 1 + R/deliver_taz_de.R | 25 +++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 27 insertions(+) create mode 100644 R/deliver_taz_de.R diff --git a/NAMESPACE b/NAMESPACE index 8dae9ab..ff8b5e6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -134,6 +134,7 @@ S3method(pb_deliver_paper,t_online_de) S3method(pb_deliver_paper,tag24_de) S3method(pb_deliver_paper,tagesschau_de) S3method(pb_deliver_paper,tagesspiegel_de) +S3method(pb_deliver_paper,taz_de) S3method(pb_deliver_paper,telegraaf_nl) S3method(pb_deliver_paper,telegraph_co_uk) S3method(pb_deliver_paper,thecanary_co) diff --git a/R/deliver_taz_de.R b/R/deliver_taz_de.R new file mode 100644 index 0000000..10e07f7 --- /dev/null +++ b/R/deliver_taz_de.R @@ -0,0 +1,25 @@ +#' @export +pb_deliver_paper.taz_de <- 
function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text2() + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[3]) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- json_df$articleBody + + s_n_list( + datetime, + author, + headline, + text + ) + } +} diff --git a/inst/status.csv b/inst/status.csv index db93467..4b6a73a 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -138,6 +138,7 @@ "tag24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "tagesschau.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "tagesspiegel.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.tagesspiegel.de/contentexport/feed/home" +"taz.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)",NA "t3n.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://t3n.de/rss.xml" "telegraaf.nl","![](https://img.shields.io/badge/status-silver-%23C0C0C0.svg)","[@JBGruber](https://github.com/JBGruber/)","[#17](https://github.com/JBGruber/paperboy/issues/17)","https://www.telegraaf.nl/rss.xml" 
"telegraph.co.uk","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From a95b7bf2c84afb18be6b05518b152ec03cab3df2 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Mon, 21 Oct 2024 11:03:51 +0200 Subject: [PATCH 110/121] better error handling tz --- R/deliver_taz_de.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/deliver_taz_de.R b/R/deliver_taz_de.R index 10e07f7..dacf1fd 100644 --- a/R/deliver_taz_de.R +++ b/R/deliver_taz_de.R @@ -5,7 +5,7 @@ pb_deliver_paper.taz_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text2() - if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { + if (isTRUE(is.na(json_txt)) || length(json_txt) <= 2) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[3]) From 4ae7b6bb1ab7cc19161eed0be7a015cab274e7a9 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Mon, 21 Oct 2024 11:08:19 +0200 Subject: [PATCH 111/121] added schwaebische.de --- NAMESPACE | 1 + R/deliver_schwaebische_de.R | 28 ++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 30 insertions(+) create mode 100644 R/deliver_schwaebische_de.R diff --git a/NAMESPACE b/NAMESPACE index ff8b5e6..5d2413e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -114,6 +114,7 @@ S3method(pb_deliver_paper,rtl_nl) S3method(pb_deliver_paper,ruhr24_de) S3method(pb_deliver_paper,ruhrnachrichten_de) S3method(pb_deliver_paper,saechsische_de) +S3method(pb_deliver_paper,schwaebische_de) S3method(pb_deliver_paper,seznamzpravy_cz) S3method(pb_deliver_paper,sfgate_com) S3method(pb_deliver_paper,shz_de) diff --git a/R/deliver_schwaebische_de.R b/R/deliver_schwaebische_de.R new file mode 100644 index 0000000..5434abb --- /dev/null +++ b/R/deliver_schwaebische_de.R @@ -0,0 +1,28 @@ +#' @export 
+pb_deliver_paper.schwaebische_de <- function(x, verbose = NULL, pb, ...) { + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text2() + if (isTRUE(is.na(json_txt)) || length(json_txt) <= 1) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[2]) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".tw-text-title-md, p.paragraph, h2.tw-mb-4") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text + ) + } +} diff --git a/inst/status.csv b/inst/status.csv index 4b6a73a..6175a76 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -119,6 +119,7 @@ "rtl.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.rtlnieuws.nl/rss.xml" "ruhr24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "ruhrnachrichten.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA +"schwaebische.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "saechsische.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", "https://www.saechsische.de/arc/outboundfeeds/rss/" 
"seznamzpravy.cz","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.seznamzpravy.cz/rss" "sfgate.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA From 7bb9b339995a4359381bbbf6f2b3f4d6f3c52151 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Mon, 21 Oct 2024 11:15:31 +0200 Subject: [PATCH 112/121] added wz.de --- NAMESPACE | 1 + R/deliver_wz_de.R | 28 ++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 30 insertions(+) create mode 100644 R/deliver_wz_de.R diff --git a/NAMESPACE b/NAMESPACE index 5d2413e..be17428 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -158,6 +158,7 @@ S3method(pb_deliver_paper,wdr_de) S3method(pb_deliver_paper,welt_de) S3method(pb_deliver_paper,wiwo_de) S3method(pb_deliver_paper,wsj_com) +S3method(pb_deliver_paper,wz_de) S3method(pb_deliver_paper,yahoo_com) S3method(pb_deliver_paper,zdf_de) S3method(pb_deliver_paper,zeit_de) diff --git a/R/deliver_wz_de.R b/R/deliver_wz_de.R new file mode 100644 index 0000000..624522c --- /dev/null +++ b/R/deliver_wz_de.R @@ -0,0 +1,28 @@ +#' @export +pb_deliver_paper.wz_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text2() + if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[1]) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes("article p.richtext,article h2.font-sans") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text + ) + } +} diff --git a/inst/status.csv b/inst/status.csv index 6175a76..48f3038 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -170,6 +170,7 @@ "welt.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.welt.de/feeds/latest.rss" "wiwo.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.wiwo.de/contentexport/feed/rss/schlagzeilen" "wsj.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://feeds.a.dj.com/rss/RSSWorldNews.xml" +"wz.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "yahoo.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://news.yahoo.com/rss.xml" 
"zdf.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.zdf.de/rss/zdf/nachrichten" "zeit.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.zeit.de/index" From 7cd635c60c588ce8182891bd47bd98a2ead58890 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Mon, 21 Oct 2024 11:19:17 +0200 Subject: [PATCH 113/121] added dnn.de --- NAMESPACE | 1 + R/deliver_dnn_de.R | 28 ++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 30 insertions(+) create mode 100644 R/deliver_dnn_de.R diff --git a/NAMESPACE b/NAMESPACE index be17428..072c262 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -35,6 +35,7 @@ S3method(pb_deliver_paper,derstandard_at) S3method(pb_deliver_paper,derwesten_de) S3method(pb_deliver_paper,deutschlandfunk_de) S3method(pb_deliver_paper,deutschlandfunkkultur_de) +S3method(pb_deliver_paper,dnn_de) S3method(pb_deliver_paper,echo24_de) S3method(pb_deliver_paper,epochtimes_de) S3method(pb_deliver_paper,evolvepolitics_com) diff --git a/R/deliver_dnn_de.R b/R/deliver_dnn_de.R new file mode 100644 index 0000000..aa3a4dc --- /dev/null +++ b/R/deliver_dnn_de.R @@ -0,0 +1,28 @@ +#' @export +pb_deliver_paper.dnn_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text2() + if (isTRUE(is.na(json_txt)) || length(json_txt) <= 2) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt[3]) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 p,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text + ) + } +} diff --git a/inst/status.csv b/inst/status.csv index 48f3038..5f0a96d 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -33,6 +33,7 @@ "der-postillon.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "deutschlandfunk.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.deutschlandfunk.de/nachrichten-100.rss" "deutschlandfunkkultur.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.deutschlandfunkkultur.de/politik-114.rss" +"dnn.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "echo24.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA 
"edition.cnn.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","http://rss.cnn.com/rss/edition.rss" "epochtimes.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA From b69303fac72588ea083b6b7c6c6d5a5191ddb51d Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Mon, 21 Oct 2024 11:23:10 +0200 Subject: [PATCH 114/121] added frankenpost.de --- NAMESPACE | 1 + R/deliver_frankenpost_de.R | 28 ++++++++++++++++++++++++++++ inst/status.csv | 1 + 3 files changed, 30 insertions(+) create mode 100644 R/deliver_frankenpost_de.R diff --git a/NAMESPACE b/NAMESPACE index 072c262..2c595c4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -84,6 +84,7 @@ S3method(pb_deliver_paper,metronieuws_nl) S3method(pb_deliver_paper,mopo_de) S3method(pb_deliver_paper,morgenpost_de) S3method(pb_deliver_paper,n_tv_de) +S3method(pb_deliver_paper,name_de) S3method(pb_deliver_paper,ndr_de) S3method(pb_deliver_paper,news_de) S3method(pb_deliver_paper,news_und_nachrichten_de) diff --git a/R/deliver_frankenpost_de.R b/R/deliver_frankenpost_de.R new file mode 100644 index 0000000..e7295cb --- /dev/null +++ b/R/deliver_frankenpost_de.R @@ -0,0 +1,28 @@ +#' @export +pb_deliver_paper.name_de <- function(x, verbose = NULL, pb, ...) 
{ + pb_tick(x, verbose, pb) + # raw html is stored in column content_raw + html <- rvest::read_html(x$content_raw) + + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>% rvest::html_text2() + if (isTRUE(is.na(json_txt)) || length(json_txt) <= 0) { + return(s_n_list()) + } else { + json_df <- jsonlite::fromJSON(json_txt) + + datetime <- lubridate::as_datetime(json_df$datePublished) + headline <- json_df$headline + author <- toString(json_df$author$name) + text <- html %>% + rvest::html_nodes(".article-text p, .article-text h2") %>% + rvest::html_text2() %>% + paste(collapse = "\n") + + s_n_list( + datetime, + author, + headline, + text + ) + } +} diff --git a/inst/status.csv b/inst/status.csv index 5f0a96d..e9b552c 100644 --- a/inst/status.csv +++ b/inst/status.csv @@ -49,6 +49,7 @@ "foxbusiness.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "foxnews.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA "fr.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","http://www.fr.de/rssfeed.rdf" +"frankenpost.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)","https://www.frankenpost.de/topmeldung.rss2.feed" "freiepresse.de","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@schochastics](https://github.com/schochastics)","[#23](https://github.com/JBGruber/paperboy/issues/23)", NA "ftw.usatoday.com","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","",NA 
"geenstijl.nl","![](https://img.shields.io/badge/status-gold-%23ffd700.svg)","[@JBGruber](https://github.com/JBGruber/)","","https://www.geenstijl.nl/feeds/recent.atom" From 920f4e53e874e1c276722650985387254af7dce6 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Mon, 21 Oct 2024 12:12:14 +0200 Subject: [PATCH 115/121] removed non ascii in handelsblat scraper --- R/deliver_handelsblatt_de.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/deliver_handelsblatt_de.R b/R/deliver_handelsblatt_de.R index 60a25e2..67ceb8f 100644 --- a/R/deliver_handelsblatt_de.R +++ b/R/deliver_handelsblatt_de.R @@ -12,7 +12,7 @@ pb_deliver_paper.handelsblatt_com <- function(x, verbose = NULL, pb, ...) { } else if (json_df$type == "404") { html <- rvest::read_html(x$content_raw) weekdays_de <- paste0(c("Montag", "Dienstag", "Mittwoch", "Donnerstag", "Freitag", "Samstag", "Sonntag"), collapse = "|") - months_de <- c("Januar", "Februar", "März", "April", "Mai", "Juni", "Juli", "August", "September", "Oktober", "November", "Dezember") + months_de <- c("Januar", "Februar", "M\U00E4rz", "April", "Mai", "Juni", "Juli", "August", "September", "Oktober", "November", "Dezember") date_string <- html %>% rvest::html_node(".post-date .meta-text") |> @@ -25,7 +25,7 @@ pb_deliver_paper.handelsblatt_com <- function(x, verbose = NULL, pb, ...) { } date_string <- gsub("Uhr", "", date_string) - date_string <- gsub("‒", "", date_string) + date_string <- gsub("-", "", date_string) date_string <- gsub(",", "", date_string) datetime <- lubridate::as_datetime(date_string, format = "%d. 
%m %Y %H:%M ") From 78aeb720bdd7afe37768e07dadbec67111d0a87f Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Mon, 21 Oct 2024 12:29:47 +0200 Subject: [PATCH 116/121] better error handling focus --- R/deliver_focus_de.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/deliver_focus_de.R b/R/deliver_focus_de.R index 41cbf96..0dee60a 100644 --- a/R/deliver_focus_de.R +++ b/R/deliver_focus_de.R @@ -9,7 +9,9 @@ pb_deliver_paper.focus_de <- function(x, verbose = NULL, pb, ...) { return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) - + if (json_df$`@type` != "NewsArticle") { + return(s_n_list()) + } datetime <- lubridate::as_datetime(json_df$datePublished) headline <- json_df$headline author <- toString(json_df$author$name) From 72768e8533e05a1a2240d5eece04873500b50f75 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Mon, 21 Oct 2024 12:41:35 +0200 Subject: [PATCH 117/121] removed call to deprecated html_node --- R/deliver_3sat_de.R | 6 +++--- R/deliver_abendblatt_de.R | 2 +- R/deliver_abendzeitung_muenchen_de.R | 2 +- R/deliver_augsburger_allgemeine.R | 8 ++++---- R/deliver_badische_zeitung_de.R | 2 +- R/deliver_berliner_kurier_de.R | 4 ++-- R/deliver_berliner_zeitung_de.R | 2 +- R/deliver_bild_de.R | 8 ++++---- R/deliver_bnn_de.R | 2 +- R/deliver_br_de.R | 10 +++++----- R/deliver_businessinsider_de.R | 4 ++-- R/deliver_der_postillon_com.R | 4 ++-- R/deliver_derstandard_at.R | 8 ++++---- R/deliver_derwesten_de.R | 4 ++-- R/deliver_deutschlandfunk_de.R | 6 +++--- R/deliver_deutschlandfunkkultur_de.R | 8 ++++---- R/deliver_dnn_de.R | 2 +- R/deliver_echo24_de.R | 2 +- R/deliver_express_de.R | 2 +- R/deliver_finanzen_net.R | 2 +- R/deliver_fnp_de.R | 2 +- R/deliver_frankenpost_de.R | 2 +- R/deliver_freiepresse_de.R | 2 +- R/deliver_handelsblatt_de.R | 6 +++--- R/deliver_haz_de.R | 2 +- R/deliver_heidelberg24_de.R | 2 +- R/deliver_heise_de.R | 2 +- R/deliver_hna_de.R | 2 +- 
R/deliver_jungefreiheit_de.R | 2 +- R/deliver_kabeleins_de.R | 2 +- R/deliver_karlsruhe_insider_de.R | 4 ++-- R/deliver_kreiszeitung_de.R | 2 +- R/deliver_ksta_de.R | 2 +- R/deliver_kurier_at.R | 2 +- R/deliver_lvz_de.R | 2 +- R/deliver_manager_magazin_de.R | 2 +- R/deliver_maz_online_de.R | 2 +- R/deliver_mdr_de.R | 2 +- R/deliver_merkur_de.R | 2 +- R/deliver_mopo_de.R | 2 +- R/deliver_morgenpost_de.R | 2 +- R/deliver_ndr_de.R | 2 +- R/deliver_newsflash24_de.R | 2 +- R/deliver_nordkurier_de.R | 2 +- R/deliver_noz_de.R | 2 +- R/deliver_nw_de.R | 2 +- R/deliver_nzz_ch.R | 2 +- R/deliver_orf_at.R | 2 +- R/deliver_ostsee_zeitung_de.R | 2 +- R/deliver_presseportal_de.R | 2 +- R/deliver_rbb24_de.R | 6 +++--- R/deliver_rnd_de.R | 6 +++--- R/deliver_rollingstone_de.R | 2 +- R/deliver_rp_online_de.R | 2 +- R/deliver_ruhr24_de.R | 2 +- R/deliver_ruhrnachrichten_de.R | 2 +- R/deliver_saechsische_de.R | 2 +- R/deliver_schwaebische_de.R | 2 +- R/deliver_shz_de.R | 2 +- R/deliver_srf_ch.R | 6 +++--- R/deliver_stuttgarter_zeitung_de.R | 2 +- R/deliver_suedkurier_de.R | 4 ++-- R/deliver_swp_de.R | 2 +- R/deliver_swr3_de.R | 8 ++++---- R/deliver_swr_de.R | 8 ++++---- R/deliver_swrfernsehen_de.R | 8 ++++---- R/deliver_t_online_de.R | 2 +- R/deliver_tagesspiegel_de.R | 2 +- R/deliver_thueringer_allgemeine_de.R | 2 +- R/deliver_tz_de.R | 2 +- R/deliver_vice_com.R | 2 +- R/deliver_volksstimme_de.R | 2 +- R/deliver_wa_de.R | 2 +- R/deliver_waz_de.R | 2 +- R/deliver_wdr_de.R | 2 +- R/deliver_wiwo_de.R | 2 +- R/deliver_wz_de.R | 2 +- R/deliver_zdf_de.R | 6 +++--- 78 files changed, 123 insertions(+), 123 deletions(-) diff --git a/R/deliver_3sat_de.R b/R/deliver_3sat_de.R index 8362825..5d47cc9 100644 --- a/R/deliver_3sat_de.R +++ b/R/deliver_3sat_de.R @@ -4,18 +4,18 @@ pb_deliver_paper.3sat_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) datetime <- html %>% - rvest::html_nodes("time") %>% + rvest::html_elements("time") %>% rvest::html_attr("datetime") %>% lubridate::as_datetime() headline <- html %>% - rvest::html_nodes(".main-content-details h2") %>% + rvest::html_elements(".main-content-details h2") %>% rvest::html_text() author <- "" # no author info found text <- html %>% - rvest::html_nodes(".o--post-long p") %>% + rvest::html_elements(".o--post-long p") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_abendblatt_de.R b/R/deliver_abendblatt_de.R index 9ff3c1f..515acb6 100644 --- a/R/deliver_abendblatt_de.R +++ b/R/deliver_abendblatt_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.abendblatt_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".article-body h3, .article-body p") %>% + rvest::html_elements(".article-body h3, .article-body p") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_abendzeitung_muenchen_de.R b/R/deliver_abendzeitung_muenchen_de.R index d75a8e9..4dd06fc 100644 --- a/R/deliver_abendzeitung_muenchen_de.R +++ b/R/deliver_abendzeitung_muenchen_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.abendzeitung_muenchen_de <- function(x, verbose = NULL, pb, ... headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".artdetail_short ,.artdetail_text p,.artdetail_text h2") %>% + rvest::html_elements(".artdetail_short ,.artdetail_text p,.artdetail_text h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_augsburger_allgemeine.R b/R/deliver_augsburger_allgemeine.R index 83d448e..23fd7fc 100644 --- a/R/deliver_augsburger_allgemeine.R +++ b/R/deliver_augsburger_allgemeine.R @@ -6,18 +6,18 @@ pb_deliver_paper.augsburger_allgemeine_de <- function(x, verbose = NULL, pb, ... 
datetime <- html %>% - rvest::html_node("time") %>% + rvest::html_element("time") %>% rvest::html_attr("datetime") %>% lubridate::as_datetime() headline <- html %>% - rvest::html_node("h2.typo-teaserheadline-SoleXL, h2.typo-articleheadline-Recife") %>% + rvest::html_element("h2.typo-teaserheadline-SoleXL, h2.typo-articleheadline-Recife") %>% rvest::html_text() author <- html %>% - rvest::html_nodes("a.typo-author-link") %>% + rvest::html_elements("a.typo-author-link") %>% rvest::html_text2() %>% toString() text <- html %>% - rvest::html_nodes(".typo-article-teaser-Recife, .typo-article-teaser, .article-body-paid-content, .typo-subhead, p.text-xs") %>% + rvest::html_elements(".typo-article-teaser-Recife, .typo-article-teaser, .article-body-paid-content, .typo-subhead, p.text-xs") %>% rvest::html_text2() %>% unique() %>% # teaser might be duplicated paste(collapse = "\n") diff --git a/R/deliver_badische_zeitung_de.R b/R/deliver_badische_zeitung_de.R index 3e49034..2fdbf68 100644 --- a/R/deliver_badische_zeitung_de.R +++ b/R/deliver_badische_zeitung_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.badische_zeitung_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author) text <- html %>% - rvest::html_nodes("section[role = \"article\"], .article-site__topic") %>% + rvest::html_elements("section[role = \"article\"], .article-site__topic") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_berliner_kurier_de.R b/R/deliver_berliner_kurier_de.R index 6299738..1308cf6 100644 --- a/R/deliver_berliner_kurier_de.R +++ b/R/deliver_berliner_kurier_de.R @@ -4,14 +4,14 @@ pb_deliver_paper.berliner_kurier_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) - json_txt <- rvest::html_nodes(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() + json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ")[1] %>% rvest::html_text() json_df <- jsonlite::fromJSON(json_txt) datetime <- lubridate::as_datetime(json_df$datePublished) headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".article_header-lead__0E3Bn, p.article_paragraph__hXYKJ, h2.article_subtitle__wx1Lu") %>% + rvest::html_elements(".article_header-lead__0E3Bn, p.article_paragraph__hXYKJ, h2.article_subtitle__wx1Lu") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_berliner_zeitung_de.R b/R/deliver_berliner_zeitung_de.R index 27eb9d3..2813ee5 100644 --- a/R/deliver_berliner_zeitung_de.R +++ b/R/deliver_berliner_zeitung_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.berliner_zeitung_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".article_paragraph__hXYKJ") %>% + rvest::html_elements(".article_paragraph__hXYKJ") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_bild_de.R b/R/deliver_bild_de.R index b8df76b..0d7b7c2 100644 --- a/R/deliver_bild_de.R +++ b/R/deliver_bild_de.R @@ -5,24 +5,24 @@ pb_deliver_paper.bild_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) datetime <- html %>% - rvest::html_node("time") %>% + rvest::html_element("time") %>% rvest::html_attr("datetime") %>% lubridate::as_datetime() # headline headline <- html %>% - rvest::html_nodes(".document-title__headline") %>% + rvest::html_elements(".document-title__headline") %>% rvest::html_text() # author author <- html %>% - rvest::html_nodes(".article_author") %>% + rvest::html_elements(".article_author") %>% rvest::html_text() %>% toString() # text text <- html %>% - rvest::html_nodes(".article-body") %>% + rvest::html_elements(".article-body") %>% rvest::html_text() %>% paste(collapse = "\n") diff --git a/R/deliver_bnn_de.R b/R/deliver_bnn_de.R index 51f78d9..354edbd 100644 --- a/R/deliver_bnn_de.R +++ b/R/deliver_bnn_de.R @@ -18,7 +18,7 @@ pb_deliver_paper.bnn_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".intro,.article__body p,.article__body h2") %>% + rvest::html_elements(".intro,.article__body p,.article__body h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_br_de.R b/R/deliver_br_de.R index 602049f..6f4ed32 100644 --- a/R/deliver_br_de.R +++ b/R/deliver_br_de.R @@ -5,21 +5,21 @@ pb_deliver_paper.br_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) datetime <- html %>% - rvest::html_node("time") %>% + rvest::html_element("time") %>% rvest::html_attr("datetime") %>% lubridate::as_datetime() headline <- html %>% - rvest::html_node(".heading1") %>% + rvest::html_element(".heading1") %>% rvest::html_text2() author <- html %>% - rvest::html_node(".ArticleModuleTeaser_authorName__Q7ctt") %>% + rvest::html_element(".ArticleModuleTeaser_authorName__Q7ctt") %>% rvest::html_text2() %>% toString() text <- html %>% - rvest::html_node(".RichText_richText__wS9Rz.body3") %>% - rvest::html_nodes("p, h2") %>% + rvest::html_element(".RichText_richText__wS9Rz.body3") %>% + rvest::html_elements("p, h2") %>% rvest::html_text2() %>% paste(collapse = "\n") s_n_list( diff --git a/R/deliver_businessinsider_de.R b/R/deliver_businessinsider_de.R index df358d7..0b51d86 100644 --- a/R/deliver_businessinsider_de.R +++ b/R/deliver_businessinsider_de.R @@ -19,8 +19,8 @@ pb_deliver_paper.businessinsider_de <- function(x, verbose = NULL, pb, ...) { datetime <- lubridate::as_datetime(json_df$datePublished) headline <- json_df$headline text <- html %>% - rvest::html_node(".article-main") %>% - rvest::html_nodes("p, h2") %>% + rvest::html_element(".article-main") %>% + rvest::html_elements("p, h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_der_postillon_com.R b/R/deliver_der_postillon_com.R index 0ce69f8..a3de3b8 100644 --- a/R/deliver_der_postillon_com.R +++ b/R/deliver_der_postillon_com.R @@ -14,14 +14,14 @@ pb_deliver_paper.der_postillon_com <- function(x, verbose = NULL, pb, ...) 
{ headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".post-body p") %>% + rvest::html_elements(".post-body p") %>% rvest::html_text2() %>% paste(collapse = "\n") # author abbr can be found at the end of the article if (author == "Der Postillon") { author_tmp <- html %>% - rvest::html_node("div[id='post-body'] span[style='font-size: x-small;']") %>% + rvest::html_element("div[id='post-body'] span[style='font-size: x-small;']") %>% rvest::html_text() %>% sub("; Erstver.*$", "", .) if (author_tmp != "") { diff --git a/R/deliver_derstandard_at.R b/R/deliver_derstandard_at.R index d23094c..95c0d84 100644 --- a/R/deliver_derstandard_at.R +++ b/R/deliver_derstandard_at.R @@ -4,21 +4,21 @@ pb_deliver_paper.derstandard_at <- function(x, verbose = NULL, pb, ...) { # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) datetime <- html %>% - rvest::html_nodes(".article-meta") %>% + rvest::html_elements(".article-meta") %>% rvest::html_text() %>% lubridate::as_datetime() headline <- html %>% - rvest::html_nodes("h1.article-title") %>% + rvest::html_elements("h1.article-title") %>% rvest::html_text() author <- html %>% - rvest::html_nodes(".article-origins") %>% + rvest::html_elements(".article-origins") %>% rvest::html_text() %>% toString() text <- html %>% - rvest::html_nodes(".article-body p, .article-body h3") %>% + rvest::html_elements(".article-body p, .article-body h3") %>% rvest::html_text2() %>% paste(collapse = "\n") # There is a note that parts of the website are blocked diff --git a/R/deliver_derwesten_de.R b/R/deliver_derwesten_de.R index 017bec4..2f59255 100644 --- a/R/deliver_derwesten_de.R +++ b/R/deliver_derwesten_de.R @@ -13,12 +13,12 @@ pb_deliver_paper.derwesten_de <- function(x, verbose = NULL, pb, ...) 
{ datetime <- lubridate::as_datetime(json_df$datePublished) headline <- json_df$headline author <- html %>% - rvest::html_nodes(".author.vcard .url.fn.n") %>% + rvest::html_elements(".author.vcard .url.fn.n") %>% rvest::html_text() %>% toString() text <- html %>% - rvest::html_nodes(".lead p,.article-body p") %>% + rvest::html_elements(".lead p,.article-body p") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_deutschlandfunk_de.R b/R/deliver_deutschlandfunk_de.R index 06601ed..396f394 100644 --- a/R/deliver_deutschlandfunk_de.R +++ b/R/deliver_deutschlandfunk_de.R @@ -5,15 +5,15 @@ pb_deliver_paper.deutschlandfunk_de <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) datetime <- html %>% - rvest::html_node("time") %>% + rvest::html_element("time") %>% rvest::html_attr("datetime") %>% lubridate::as_datetime() headline <- html %>% - rvest::html_node(".headline-title") %>% + rvest::html_element(".headline-title") %>% rvest::html_text() author <- "deutschlandfunk.de" # could not find article with author text <- html %>% - rvest::html_nodes(".article-header-description,.article-details-text:not(.u-text-italic),.article-details-title") %>% + rvest::html_elements(".article-header-description,.article-details-text:not(.u-text-italic),.article-details-title") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_deutschlandfunkkultur_de.R b/R/deliver_deutschlandfunkkultur_de.R index 154b3fc..db59109 100644 --- a/R/deliver_deutschlandfunkkultur_de.R +++ b/R/deliver_deutschlandfunkkultur_de.R @@ -5,17 +5,17 @@ pb_deliver_paper.deutschlandfunkkultur_de <- function(x, verbose = NULL, pb, ... 
html <- rvest::read_html(x$content_raw) datetime <- html %>% - rvest::html_node("time") %>% + rvest::html_element("time") %>% rvest::html_attr("datetime") %>% lubridate::as_datetime() headline <- html %>% - rvest::html_node(".headline-title,.section-article-head-area-title") %>% + rvest::html_element(".headline-title,.section-article-head-area-title") %>% rvest::html_text() author <- html %>% - rvest::html_node(".article-header-author") %>% + rvest::html_element(".article-header-author") %>% rvest::html_text() text <- html %>% - rvest::html_nodes(".section-article-head-area-description,.article-header-description,.article-details-text:not(.u-text-italic),.article-details-title") %>% + rvest::html_elements(".section-article-head-area-description,.article-header-description,.article-details-text:not(.u-text-italic),.article-details-title") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_dnn_de.R b/R/deliver_dnn_de.R index aa3a4dc..43bbb1a 100644 --- a/R/deliver_dnn_de.R +++ b/R/deliver_dnn_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.dnn_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 p,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 h2") %>% + rvest::html_elements(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 p,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_echo24_de.R b/R/deliver_echo24_de.R index 7503ccf..d222d46 100644 --- a/R/deliver_echo24_de.R +++ b/R/deliver_echo24_de.R @@ -15,7 +15,7 @@ pb_deliver_paper.echo24_de <- function(x, verbose = NULL, pb, ...) 
{ headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% + rvest::html_elements(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_express_de.R b/R/deliver_express_de.R index 21901ee..9279783 100644 --- a/R/deliver_express_de.R +++ b/R/deliver_express_de.R @@ -19,7 +19,7 @@ pb_deliver_paper.express_de <- function(x, verbose = NULL, pb, ...) { datetime <- lubridate::as_datetime(json_df$datePublished) headline <- sub(" \\| .*", "", json_df$headline) text <- html %>% - rvest::html_nodes(".dm-article__intro,.dm-paragraph,.dm-article__subheadline") %>% + rvest::html_elements(".dm-article__intro,.dm-paragraph,.dm-article__subheadline") %>% rvest::html_text2() %>% paste(collapse = "\n") if (author == "") { diff --git a/R/deliver_finanzen_net.R b/R/deliver_finanzen_net.R index ee78407..c712b3e 100644 --- a/R/deliver_finanzen_net.R +++ b/R/deliver_finanzen_net.R @@ -14,7 +14,7 @@ pb_deliver_paper.finanzen_net <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes("p.h3, .news-container__text p, .news-container__text h2") %>% + rvest::html_elements("p.h3, .news-container__text p, .news-container__text h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_fnp_de.R b/R/deliver_fnp_de.R index a6c514f..faca40d 100644 --- a/R/deliver_fnp_de.R +++ b/R/deliver_fnp_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.fnp_de <- function(x, verbose = NULL, pb, ...) 
{ headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% + rvest::html_elements(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_frankenpost_de.R b/R/deliver_frankenpost_de.R index e7295cb..562e851 100644 --- a/R/deliver_frankenpost_de.R +++ b/R/deliver_frankenpost_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.name_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".article-text p, .article-text h2") %>% + rvest::html_elements(".article-text p, .article-text h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_freiepresse_de.R b/R/deliver_freiepresse_de.R index 0e0cf89..f468f14 100644 --- a/R/deliver_freiepresse_de.R +++ b/R/deliver_freiepresse_de.R @@ -18,7 +18,7 @@ pb_deliver_paper.freiepresse_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author) text <- html %>% - rvest::html_nodes(".article__text p,.article__text h2") %>% + rvest::html_elements(".article__text p,.article__text h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_handelsblatt_de.R b/R/deliver_handelsblatt_de.R index 67ceb8f..dd5ef3f 100644 --- a/R/deliver_handelsblatt_de.R +++ b/R/deliver_handelsblatt_de.R @@ -15,7 +15,7 @@ pb_deliver_paper.handelsblatt_com <- function(x, verbose = NULL, pb, ...) 
{ months_de <- c("Januar", "Februar", "M\U00E4rz", "April", "Mai", "Juni", "Juli", "August", "September", "Oktober", "November", "Dezember") date_string <- html %>% - rvest::html_node(".post-date .meta-text") |> + rvest::html_element(".post-date .meta-text") |> rvest::html_text2() date_string <- gsub(weekdays_de, "", date_string) @@ -30,11 +30,11 @@ pb_deliver_paper.handelsblatt_com <- function(x, verbose = NULL, pb, ...) { datetime <- lubridate::as_datetime(date_string, format = "%d. %m %Y %H:%M ") headline <- html %>% - rvest::html_node("h1.entry-title") %>% + rvest::html_element("h1.entry-title") %>% rvest::html_text() author <- "" text <- html %>% - rvest::html_nodes(".entry-content p, .entry-content h2") %>% + rvest::html_elements(".entry-content p, .entry-content h2") %>% rvest::html_text2() %>% paste(collapse = "\n") } else { diff --git a/R/deliver_haz_de.R b/R/deliver_haz_de.R index d3dfb95..c95d020 100644 --- a/R/deliver_haz_de.R +++ b/R/deliver_haz_de.R @@ -18,7 +18,7 @@ pb_deliver_paper.haz_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2,.Textstyled__Text-sc-1cqv9mi-0.gqSIEH") %>% + rvest::html_elements(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2,.Textstyled__Text-sc-1cqv9mi-0.gqSIEH") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_heidelberg24_de.R b/R/deliver_heidelberg24_de.R index dedc0f2..7bd61af 100644 --- a/R/deliver_heidelberg24_de.R +++ b/R/deliver_heidelberg24_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.heidelberg24_de <- function(x, verbose = NULL, pb, ...) 
{ headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% + rvest::html_elements(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_heise_de.R b/R/deliver_heise_de.R index 76c8b90..eac850e 100644 --- a/R/deliver_heise_de.R +++ b/R/deliver_heise_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.heise_de <- function(x, verbose = NULL, pb, ...) { author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes("#lead,#article-content-body .ringCommonDetail.ringBlockType-paragraph,.article-content,.a-article-header__lead") %>% + rvest::html_elements("#lead,#article-content-body .ringCommonDetail.ringBlockType-paragraph,.article-content,.a-article-header__lead") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_hna_de.R b/R/deliver_hna_de.R index 556bc45..10a4176 100644 --- a/R/deliver_hna_de.R +++ b/R/deliver_hna_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.hna_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% + rvest::html_elements(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_jungefreiheit_de.R b/R/deliver_jungefreiheit_de.R index c13aa5b..48a1519 100644 --- a/R/deliver_jungefreiheit_de.R +++ b/R/deliver_jungefreiheit_de.R @@ -19,7 +19,7 @@ pb_deliver_paper.jungefreiheit_de <- function(x, verbose = NULL, pb, ...) 
{ datetime <- lubridate::as_datetime(json_df$datePublished) headline <- json_df$headline text <- html %>% - rvest::html_nodes(".elementor-widget-container p, .elementor-widget-container h3") %>% + rvest::html_elements(".elementor-widget-container p, .elementor-widget-container h3") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_kabeleins_de.R b/R/deliver_kabeleins_de.R index ca22970..18bd177 100644 --- a/R/deliver_kabeleins_de.R +++ b/R/deliver_kabeleins_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.kabeleins_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes("p.css-1tkp8z5, h2.css-xfddm,p.css-1pcz62z") %>% + rvest::html_elements("p.css-1tkp8z5, h2.css-xfddm,p.css-1pcz62z") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_karlsruhe_insider_de.R b/R/deliver_karlsruhe_insider_de.R index 059b1b7..fd69a92 100644 --- a/R/deliver_karlsruhe_insider_de.R +++ b/R/deliver_karlsruhe_insider_de.R @@ -19,8 +19,8 @@ pb_deliver_paper.karlsruhe_insider_de <- function(x, verbose = NULL, pb, ...) { datetime <- lubridate::as_datetime(json_df$datePublished) headline <- json_df$headline text <- html %>% - rvest::html_node("article .td-post-content") %>% - rvest::html_nodes("p, h2") %>% + rvest::html_element("article .td-post-content") %>% + rvest::html_elements("p, h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_kreiszeitung_de.R b/R/deliver_kreiszeitung_de.R index 419ff29..a3449fc 100644 --- a/R/deliver_kreiszeitung_de.R +++ b/R/deliver_kreiszeitung_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.kreiszeitung_de <- function(x, verbose = NULL, pb, ...) 
{ headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% + rvest::html_elements(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_ksta_de.R b/R/deliver_ksta_de.R index 165e7a4..ea48e63 100644 --- a/R/deliver_ksta_de.R +++ b/R/deliver_ksta_de.R @@ -19,7 +19,7 @@ pb_deliver_paper.ksta_de <- function(x, verbose = NULL, pb, ...) { datetime <- lubridate::as_datetime(json_df$datePublished) headline <- sub(" \\| .*", "", json_df$headline) text <- html %>% - rvest::html_nodes(".dm-article__intro,.dm-paragraph,.dm-article__subheadline") %>% + rvest::html_elements(".dm-article__intro,.dm-paragraph,.dm-article__subheadline") %>% rvest::html_text2() %>% paste(collapse = "\n") if (author == "") { diff --git a/R/deliver_kurier_at.R b/R/deliver_kurier_at.R index b904cc1..31deb4b 100644 --- a/R/deliver_kurier_at.R +++ b/R/deliver_kurier_at.R @@ -14,7 +14,7 @@ pb_deliver_paper.kurier_at <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".headerComp-intro,.paragraph.copy") %>% + rvest::html_elements(".headerComp-intro,.paragraph.copy") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_lvz_de.R b/R/deliver_lvz_de.R index 07d67d2..9cb41c9 100644 --- a/R/deliver_lvz_de.R +++ b/R/deliver_lvz_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.lvz_de <- function(x, verbose = NULL, pb, ...) 
{ headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Headlinestyled__Headline-sc-mamptc-0,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Textstyled__Text-sc-1cqv9mi-0") %>% + rvest::html_elements(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Headlinestyled__Headline-sc-mamptc-0,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Textstyled__Text-sc-1cqv9mi-0") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_manager_magazin_de.R b/R/deliver_manager_magazin_de.R index bfd3cae..9336320 100644 --- a/R/deliver_manager_magazin_de.R +++ b/R/deliver_manager_magazin_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.manager_magazin_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".leading-loose, .RichText p, .RichText h3") %>% + rvest::html_elements(".leading-loose, .RichText p, .RichText h3") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_maz_online_de.R b/R/deliver_maz_online_de.R index 8c909cb..f75365f 100644 --- a/R/deliver_maz_online_de.R +++ b/R/deliver_maz_online_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.maz_online_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes("header .Textstyled__Text-sc-1cqv9mi-0, article .Textstyled__Text-sc-1cqv9mi-0, article h2") %>% + rvest::html_elements("header .Textstyled__Text-sc-1cqv9mi-0, article .Textstyled__Text-sc-1cqv9mi-0, article h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_mdr_de.R b/R/deliver_mdr_de.R index 20366cc..b7b0b6f 100644 --- a/R/deliver_mdr_de.R +++ b/R/deliver_mdr_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.mdr_de <- function(x, verbose = NULL, pb, ...) 
{ headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".einleitung,.paragraph") %>% + rvest::html_elements(".einleitung,.paragraph") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_merkur_de.R b/R/deliver_merkur_de.R index 09700c9..c613fce 100644 --- a/R/deliver_merkur_de.R +++ b/R/deliver_merkur_de.R @@ -15,7 +15,7 @@ pb_deliver_paper.merkur_de <- function(x, verbose = NULL, pb, ...) { author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% + rvest::html_elements(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_mopo_de.R b/R/deliver_mopo_de.R index 3254b4a..0378266 100644 --- a/R/deliver_mopo_de.R +++ b/R/deliver_mopo_de.R @@ -19,7 +19,7 @@ pb_deliver_paper.mopo_de <- function(x, verbose = NULL, pb, ...) { datetime <- lubridate::as_datetime(json_df$datePublished) headline <- sub(" \\| .*", "", json_df$headline) text <- html %>% - rvest::html_nodes("p, h2") %>% + rvest::html_elements("p, h2") %>% rvest::html_text2() %>% paste(collapse = "\n") if (author == "") { diff --git a/R/deliver_morgenpost_de.R b/R/deliver_morgenpost_de.R index 461d210..e8b8657 100644 --- a/R/deliver_morgenpost_de.R +++ b/R/deliver_morgenpost_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.morgenpost_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".article-body p") %>% + rvest::html_elements(".article-body p") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_ndr_de.R b/R/deliver_ndr_de.R index 96a61c9..ef1f28f 100644 --- a/R/deliver_ndr_de.R +++ b/R/deliver_ndr_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.ndr_de <- function(x, verbose = NULL, pb, ...) 
{ headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".modulepadding.copytext p, .modulepadding.copytext h2") %>% + rvest::html_elements(".modulepadding.copytext p, .modulepadding.copytext h2") %>% rvest::html_text2() %>% paste(collapse = "\n") } else { diff --git a/R/deliver_newsflash24_de.R b/R/deliver_newsflash24_de.R index 71e5b98..f1593a4 100644 --- a/R/deliver_newsflash24_de.R +++ b/R/deliver_newsflash24_de.R @@ -19,7 +19,7 @@ pb_deliver_paper.newsflash24_de <- function(x, verbose = NULL, pb, ...) { datetime <- lubridate::as_datetime(json_df$datePublished) headline <- json_df$headline text <- html %>% - rvest::html_nodes(".entry-content p, .entry-content h2") %>% + rvest::html_elements(".entry-content p, .entry-content h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_nordkurier_de.R b/R/deliver_nordkurier_de.R index 51fc9a6..fc99167 100644 --- a/R/deliver_nordkurier_de.R +++ b/R/deliver_nordkurier_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.nordkurier_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".tw-text-title-md, .paragraph,h2.tw-mb-4") %>% + rvest::html_elements(".tw-text-title-md, .paragraph,h2.tw-mb-4") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_noz_de.R b/R/deliver_noz_de.R index 422d01d..0f2349a 100644 --- a/R/deliver_noz_de.R +++ b/R/deliver_noz_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.noz_de <- function(x, verbose = NULL, pb, ...) 
{ headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes("p.w-600,section.content--group p, section.content--group h2") %>% + rvest::html_elements("p.w-600,section.content--group p, section.content--group h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_nw_de.R b/R/deliver_nw_de.R index 42d16a7..1673ca2 100644 --- a/R/deliver_nw_de.R +++ b/R/deliver_nw_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.nw_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes("p.em_text,h2.Zwischenzeile") %>% + rvest::html_elements("p.em_text,h2.Zwischenzeile") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_nzz_ch.R b/R/deliver_nzz_ch.R index 8b2f148..a60e1e8 100644 --- a/R/deliver_nzz_ch.R +++ b/R/deliver_nzz_ch.R @@ -14,7 +14,7 @@ pb_deliver_paper.nzz_ch <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".headline__lead,.articlecomponent.text,.subtitle,.articlecomponent") %>% + rvest::html_elements(".headline__lead,.articlecomponent.text,.subtitle,.articlecomponent") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_orf_at.R b/R/deliver_orf_at.R index 90b3f5e..159ae38 100644 --- a/R/deliver_orf_at.R +++ b/R/deliver_orf_at.R @@ -14,7 +14,7 @@ pb_deliver_paper.orf_at <- function(x, verbose = NULL, pb, ...) 
{ headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".story-lead-text,.story-story p,.story-story h2") %>% + rvest::html_elements(".story-lead-text,.story-story p,.story-story h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_ostsee_zeitung_de.R b/R/deliver_ostsee_zeitung_de.R index 06a9ee3..d9f3bea 100644 --- a/R/deliver_ostsee_zeitung_de.R +++ b/R/deliver_ostsee_zeitung_de.R @@ -18,7 +18,7 @@ pb_deliver_paper.ostsee_zeitung_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Textstyled__Text-sc-1cqv9mi-0,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Headlinestyled__Headline-sc-mamptc-0") %>% + rvest::html_elements(".Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Textstyled__Text-sc-1cqv9mi-0,.Articlestyled__ArticleBodyWrapper-sc-7y75gq-2 .Headlinestyled__Headline-sc-mamptc-0") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_presseportal_de.R b/R/deliver_presseportal_de.R index 6e33ec2..74258cf 100644 --- a/R/deliver_presseportal_de.R +++ b/R/deliver_presseportal_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.presseportal_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes("article.story p:not([class])") %>% + rvest::html_elements("article.story p:not([class])") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_rbb24_de.R b/R/deliver_rbb24_de.R index a255e14..b6f5145 100644 --- a/R/deliver_rbb24_de.R +++ b/R/deliver_rbb24_de.R @@ -5,20 +5,20 @@ pb_deliver_paper.rbb24_de <- function(x, verbose = NULL, pb, ...) 
{ html <- rvest::read_html(x$content_raw) datetime <- html %>% - rvest::html_nodes(".technicalline .lineinfo") %>% + rvest::html_elements(".technicalline .lineinfo") %>% rvest::html_text2() %>% gsub(".*(\\d{2}\\.\\d{2}\\.\\d{2}) \\| (\\d{2}:\\d{2}).*", "\\1 \\2", .) %>% lubridate::as_datetime(format = "%d.%m.%y %H:%M", tz = "UTC") # This will not be the correct timezone headline <- html %>% - rvest::html_nodes(".titletext") %>% + rvest::html_elements(".titletext") %>% rvest::html_text2() author <- "" # no article with author info found text <- html %>% - rvest::html_nodes(".shorttext p, .textblock p, h4.texttitle") %>% + rvest::html_elements(".shorttext p, .textblock p, h4.texttitle") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_rnd_de.R b/R/deliver_rnd_de.R index 88947e9..98c451f 100644 --- a/R/deliver_rnd_de.R +++ b/R/deliver_rnd_de.R @@ -17,12 +17,12 @@ pb_deliver_paper.rnd_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0") %>% + rvest::html_elements(".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0") %>% rvest::html_text2() more_items <- html %>% # delete content in lists of related items - rvest::html_nodes("div[data-is-element-rendered='true']") %>% - rvest::html_nodes(".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0") %>% + rvest::html_elements("div[data-is-element-rendered='true']") %>% + rvest::html_elements(".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0") %>% rvest::html_text2() text <- text[!text %in% more_items] %>% paste(collapse = "\n") s_n_list( diff --git a/R/deliver_rollingstone_de.R b/R/deliver_rollingstone_de.R index 4e4b0ff..1f4c8ce 100644 --- a/R/deliver_rollingstone_de.R +++ b/R/deliver_rollingstone_de.R @@ -19,7 +19,7 @@ pb_deliver_paper.rollingstone_de <- function(x, verbose = 
NULL, pb, ...) { datetime <- lubridate::as_datetime(json_df$datePublished) headline <- json_df$headline text <- html %>% - rvest::html_nodes(".asmb-article-excerpt,.asmb-article-content-container h2,.asmb-article-content-container p") %>% + rvest::html_elements(".asmb-article-excerpt,.asmb-article-content-container h2,.asmb-article-content-container p") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_rp_online_de.R b/R/deliver_rp_online_de.R index 3c9bce3..810a370 100644 --- a/R/deliver_rp_online_de.R +++ b/R/deliver_rp_online_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.rp_online_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes("strong[data-cy=\"intro\"],div[data-cy=\"article_content\"] p") %>% + rvest::html_elements("strong[data-cy=\"intro\"],div[data-cy=\"article_content\"] p") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_ruhr24_de.R b/R/deliver_ruhr24_de.R index 20ccd3a..c88f01b 100644 --- a/R/deliver_ruhr24_de.R +++ b/R/deliver_ruhr24_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.ruhr24_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-crosshead,.id-StoryElement-paragraph") %>% + rvest::html_elements(".id-StoryElement-leadText,.id-StoryElement-crosshead,.id-StoryElement-paragraph") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_ruhrnachrichten_de.R b/R/deliver_ruhrnachrichten_de.R index 3486f80..c3bd085 100644 --- a/R/deliver_ruhrnachrichten_de.R +++ b/R/deliver_ruhrnachrichten_de.R @@ -19,7 +19,7 @@ pb_deliver_paper.ruhrnachrichten_de <- function(x, verbose = NULL, pb, ...) 
{ datetime <- lubridate::as_datetime(json_df$datePublished) headline <- json_df$headline text <- html %>% - rvest::html_nodes("p.article__teaser-text,.article__content p, .article__content h2") %>% + rvest::html_elements("p.article__teaser-text,.article__content p, .article__content h2") %>% rvest::html_text2() %>% paste(collapse = "\n") %>% gsub("\nZur Startseite$", "", .) diff --git a/R/deliver_saechsische_de.R b/R/deliver_saechsische_de.R index 31a32cb..4ce59a6 100644 --- a/R/deliver_saechsische_de.R +++ b/R/deliver_saechsische_de.R @@ -17,7 +17,7 @@ pb_deliver_paper.saechsische_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0") %>% + rvest::html_elements(".Textstyled__Text-sc-1cqv9mi-0, .Headlinestyled__Headline-sc-mamptc-0") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_schwaebische_de.R b/R/deliver_schwaebische_de.R index 5434abb..4a271ab 100644 --- a/R/deliver_schwaebische_de.R +++ b/R/deliver_schwaebische_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.schwaebische_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".tw-text-title-md, p.paragraph, h2.tw-mb-4") %>% + rvest::html_elements(".tw-text-title-md, p.paragraph, h2.tw-mb-4") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_shz_de.R b/R/deliver_shz_de.R index ddbe9e8..e713d90 100644 --- a/R/deliver_shz_de.R +++ b/R/deliver_shz_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.shz_de <- function(x, verbose = NULL, pb, ...) 
{ headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes("p.w-600, p,h2.h4") %>% + rvest::html_elements("p.w-600, p,h2.h4") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_srf_ch.R b/R/deliver_srf_ch.R index 8ef8d3e..3352cee 100644 --- a/R/deliver_srf_ch.R +++ b/R/deliver_srf_ch.R @@ -5,20 +5,20 @@ pb_deliver_paper.srf_ch <- function(x, verbose = NULL, pb, ...) { html <- rvest::read_html(x$content_raw) json_df <- html %>% - rvest::html_node("span#config__js") %>% + rvest::html_element("span#config__js") %>% rvest::html_attr("data-analytics-webtrekk-survey-gizmo-value-object") %>% jsonlite::fromJSON() datetime <- lubridate::as_datetime(json_df$params$content_publication_datetime) headline <- html %>% - rvest::html_nodes("h1 .article-title__text") %>% + rvest::html_elements("h1 .article-title__text") %>% rvest::html_text() author <- "" # no article with author info founds text <- html %>% - rvest::html_nodes(".article-content p, .article-content h2") %>% + rvest::html_elements(".article-content p, .article-content h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_stuttgarter_zeitung_de.R b/R/deliver_stuttgarter_zeitung_de.R index 6d731fd..b0b07dd 100644 --- a/R/deliver_stuttgarter_zeitung_de.R +++ b/R/deliver_stuttgarter_zeitung_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.stuttgarter_zeitung_de <- function(x, verbose = NULL, pb, ...) 
headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".brick.intro-text p,.brickgroup p,.brickgroup h2") %>% + rvest::html_elements(".brick.intro-text p,.brickgroup p,.brickgroup h2") %>% rvest::html_text2() rm_text <- c("StZ-Plus-Abonnement", "Vertrag mit Werbung") diff --git a/R/deliver_suedkurier_de.R b/R/deliver_suedkurier_de.R index 0ecb6a8..c802c39 100644 --- a/R/deliver_suedkurier_de.R +++ b/R/deliver_suedkurier_de.R @@ -11,14 +11,14 @@ pb_deliver_paper.suedkurier_de <- function(x, verbose = NULL, pb, ...) { json_df <- jsonlite::fromJSON(json_txt[1]) datetime <- lubridate::as_datetime(json_df$datePublished) headline <- html %>% - rvest::html_node("header h1") %>% + rvest::html_element("header h1") %>% rvest::html_text() author <- paste0("<p>", json_df$author$name, "</p>", collapse = ",") %>% rvest::read_html() %>% rvest::html_text() %>% toString() text <- html %>% - rvest::html_nodes(".article-summary,.article-jsonld.article-paywall-summary,.article-jsonld p") %>% + rvest::html_elements(".article-summary,.article-jsonld.article-paywall-summary,.article-jsonld p") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_swp_de.R b/R/deliver_swp_de.R index 6afa8d7..7558b31 100644 --- a/R/deliver_swp_de.R +++ b/R/deliver_swp_de.R @@ -13,7 +13,7 @@ pb_deliver_paper.swp_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".u-article-header .fs-4,.u-paragraph, .u-title.u-headline") %>% + rvest::html_elements(".u-article-header .fs-4,.u-paragraph, .u-title.u-headline") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_swr3_de.R b/R/deliver_swr3_de.R index 85d400a..1d77c7e 100644 --- a/R/deliver_swr3_de.R +++ b/R/deliver_swr3_de.R @@ -4,21 +4,21 @@ pb_deliver_paper.swr3_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) datetime <- html %>% - rvest::html_nodes(".meta-top time") %>% + rvest::html_elements(".meta-top time") %>% rvest::html_attr("datetime") %>% lubridate::as_datetime() headline <- html %>% - rvest::html_nodes("h1.headline") %>% + rvest::html_elements("h1.headline") %>% rvest::html_text() author <- html %>% - rvest::html_nodes(".meta-top .meta-author-name a") %>% + rvest::html_elements(".meta-top .meta-author-name a") %>% rvest::html_text2() %>% toString() text <- html %>% - rvest::html_nodes("p.lead, .bodytext p, .bodytext h2") %>% + rvest::html_elements("p.lead, .bodytext p, .bodytext h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_swr_de.R b/R/deliver_swr_de.R index eeac63b..221a955 100644 --- a/R/deliver_swr_de.R +++ b/R/deliver_swr_de.R @@ -6,18 +6,18 @@ pb_deliver_paper.swr_de <- function(x, verbose = NULL, pb, ...) { datetime <- html %>% - rvest::html_node("time") %>% + rvest::html_element("time") %>% rvest::html_attr("datetime") %>% lubridate::as_datetime() headline <- html %>% - rvest::html_node("h1.headline") %>% + rvest::html_element("h1.headline") %>% rvest::html_text() author <- html %>% - rvest::html_nodes(".meta-top .meta-authors .meta-author-name a") %>% + rvest::html_elements(".meta-top .meta-authors .meta-author-name a") %>% rvest::html_text2() %>% toString() text <- html %>% - rvest::html_nodes(".detail-body .lead, .bodytext p, .bodytext h2") %>% + rvest::html_elements(".detail-body .lead, .bodytext p, .bodytext h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_swrfernsehen_de.R b/R/deliver_swrfernsehen_de.R index 8506be7..70567e3 100644 --- a/R/deliver_swrfernsehen_de.R +++ b/R/deliver_swrfernsehen_de.R @@ -4,21 +4,21 @@ pb_deliver_paper.swrfernsehen_de <- function(x, verbose = NULL, pb, ...) 
{ # raw html is stored in column content_raw html <- rvest::read_html(x$content_raw) datetime <- html %>% - rvest::html_nodes(".meta-top .meta-description time") %>% + rvest::html_elements(".meta-top .meta-description time") %>% rvest::html_attr("datetime") %>% lubridate::as_datetime() headline <- html %>% - rvest::html_nodes("h1.headline") %>% + rvest::html_elements("h1.headline") %>% rvest::html_text() author <- html %>% - rvest::html_nodes(".meta-top .meta-author-name a") %>% + rvest::html_elements(".meta-top .meta-author-name a") %>% rvest::html_text2() %>% toString() text <- html %>% - rvest::html_nodes(".detail-body .lead,.bodytext p,.bodytext h2") %>% + rvest::html_elements(".detail-body .lead,.bodytext p,.bodytext h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_t_online_de.R b/R/deliver_t_online_de.R index 13f0de3..f5961c9 100644 --- a/R/deliver_t_online_de.R +++ b/R/deliver_t_online_de.R @@ -15,7 +15,7 @@ pb_deliver_paper.t_online_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author[[1]]$name) text <- html %>% - rvest::html_nodes("div[data-testid=\"ArticleBody.StreamLayout\"] p") %>% + rvest::html_elements("div[data-testid=\"ArticleBody.StreamLayout\"] p") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_tagesspiegel_de.R b/R/deliver_tagesspiegel_de.R index 802c278..48c547f 100644 --- a/R/deliver_tagesspiegel_de.R +++ b/R/deliver_tagesspiegel_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.tagesspiegel_de <- function(x, verbose = NULL, pb, ...) 
{ headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes("#story-elements p") %>% + rvest::html_elements("#story-elements p") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_thueringer_allgemeine_de.R b/R/deliver_thueringer_allgemeine_de.R index 94b41fa..5bba2cc 100644 --- a/R/deliver_thueringer_allgemeine_de.R +++ b/R/deliver_thueringer_allgemeine_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.thueringer_allgemeine_de <- function(x, verbose = NULL, pb, ... headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".article-body p, .article-body h3") %>% + rvest::html_elements(".article-body p, .article-body h3") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_tz_de.R b/R/deliver_tz_de.R index 289421d..1792a41 100644 --- a/R/deliver_tz_de.R +++ b/R/deliver_tz_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.tz_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-crosshead,.id-StoryElement-paragraph") %>% + rvest::html_elements(".id-StoryElement-leadText,.id-StoryElement-crosshead,.id-StoryElement-paragraph") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_vice_com.R b/R/deliver_vice_com.R index 1d7cb56..df3f4e6 100644 --- a/R/deliver_vice_com.R +++ b/R/deliver_vice_com.R @@ -14,7 +14,7 @@ pb_deliver_paper.vice_com <- function(x, verbose = NULL, pb, ...) 
{ headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".entry-content.entry-content p,.entry-content entry-content h2") %>% + rvest::html_elements(".entry-content.entry-content p,.entry-content entry-content h2") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_volksstimme_de.R b/R/deliver_volksstimme_de.R index 219cef1..a41548e 100644 --- a/R/deliver_volksstimme_de.R +++ b/R/deliver_volksstimme_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.volksstimme_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".fp-article-heading__excerpt,.fp-paragraph, .fp-subheading") %>% + rvest::html_elements(".fp-article-heading__excerpt,.fp-paragraph, .fp-subheading") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_wa_de.R b/R/deliver_wa_de.R index b0cd136..1583ce3 100644 --- a/R/deliver_wa_de.R +++ b/R/deliver_wa_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.wa_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% + rvest::html_elements(".id-StoryElement-leadText,.id-StoryElement-paragraph,.id-StoryElement-crosshead") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_waz_de.R b/R/deliver_waz_de.R index d6d590b..6b4229c 100644 --- a/R/deliver_waz_de.R +++ b/R/deliver_waz_de.R @@ -17,7 +17,7 @@ pb_deliver_paper.waz_de <- function(x, verbose = NULL, pb, ...) 
{ headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".article-body p,.article-body h3") %>% + rvest::html_elements(".article-body p,.article-body h3") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_wdr_de.R b/R/deliver_wdr_de.R index 60cc46a..4ccd781 100644 --- a/R/deliver_wdr_de.R +++ b/R/deliver_wdr_de.R @@ -16,7 +16,7 @@ pb_deliver_paper.wdr_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) %>% gsub("/", ",", .) text <- html %>% - rvest::html_nodes(".einleitung,.text,.subtitle") %>% + rvest::html_elements(".einleitung,.text,.subtitle") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_wiwo_de.R b/R/deliver_wiwo_de.R index 52625f3..f8d25f1 100644 --- a/R/deliver_wiwo_de.R +++ b/R/deliver_wiwo_de.R @@ -15,7 +15,7 @@ pb_deliver_paper.wiwo_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$creator) text <- html %>% - rvest::html_nodes(".c-leadtext,.u-richtext h3,.u-richtext p") %>% + rvest::html_elements(".c-leadtext,.u-richtext h3,.u-richtext p") %>% rvest::html_text2() %>% .[!grepl("Lesen Sie auch", .)] %>% # Remove links in between paste(collapse = "\n") diff --git a/R/deliver_wz_de.R b/R/deliver_wz_de.R index 624522c..686259a 100644 --- a/R/deliver_wz_de.R +++ b/R/deliver_wz_de.R @@ -14,7 +14,7 @@ pb_deliver_paper.wz_de <- function(x, verbose = NULL, pb, ...) { headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes("article p.richtext,article h2.font-sans") %>% + rvest::html_elements("article p.richtext,article h2.font-sans") %>% rvest::html_text2() %>% paste(collapse = "\n") diff --git a/R/deliver_zdf_de.R b/R/deliver_zdf_de.R index bb275d2..3097e02 100644 --- a/R/deliver_zdf_de.R +++ b/R/deliver_zdf_de.R @@ -17,7 +17,7 @@ pb_deliver_paper.zdf_de <- function(x, verbose = NULL, pb, ...) 
{ headline <- json_df$headline author <- toString(json_df$author$name) text <- html %>% - rvest::html_nodes(".r1nj4qn5") %>% + rvest::html_elements(".r1nj4qn5") %>% rvest::html_text2() %>% paste(collapse = "\n") } else if (json_df$`@type` == "VideoObject") { @@ -27,11 +27,11 @@ pb_deliver_paper.zdf_de <- function(x, verbose = NULL, pb, ...) { text <- json_df$description } else { datetime <- html %>% - rvest::html_node("time") %>% + rvest::html_element("time") %>% rvest::html_attr("datetime") %>% lubridate::as_datetime() headline <- html %>% - rvest::html_node("main h2") %>% + rvest::html_element("main h2") %>% rvest::html_text2() author <- "" text <- "" From a7605e350954c9e13bbaf98da9c01051860b231f Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Mon, 21 Oct 2024 12:45:52 +0200 Subject: [PATCH 118/121] further focus.de error handling --- R/deliver_focus_de.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/deliver_focus_de.R b/R/deliver_focus_de.R index 0dee60a..157127d 100644 --- a/R/deliver_focus_de.R +++ b/R/deliver_focus_de.R @@ -9,6 +9,9 @@ pb_deliver_paper.focus_de <- function(x, verbose = NULL, pb, ...) 
{ return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[1]) + if ("@graph" %in% names(json_df)) { + json_df <- json_df$`@graph` + } if (json_df$`@type` != "NewsArticle") { return(s_n_list()) } From d54653b79e3f28881675a9a175a21697838486fb Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Tue, 22 Oct 2024 06:49:31 +0200 Subject: [PATCH 119/121] added David as ctb --- DESCRIPTION | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0365775..9a09b90 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,11 +3,16 @@ Title: Comprehensive Collection of News Media Scrapers Version: 0.0.7.9000 Date: 2024-07-17 Authors@R: - person(given = "Johannes B.", - family = "Gruber", - email = "JohannesB.Gruber@gmail.com", - role = c("aut", "cre"), - comment = c(ORCID = "0000-0001-9177-1772")) + c(person(given = "Johannes B.", + family = "Gruber", + email = "JohannesB.Gruber@gmail.com", + role = c("aut", "cre"), + comment = c(ORCID = "0000-0001-9177-1772")), + person(given = "David", + family = "Schoch", + email = "david@schochastics.net", + role = "ctb", + comment = c(ORCID = "0000-0003-2952-4812"))) Description: A comprehensive collection of webscraping scripts for news media sites. Depends: R (>= 3.5.0) From 230db676a571328d5f58d04ea22babe576ed7621 Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 24 Oct 2024 11:01:13 +0200 Subject: [PATCH 120/121] better error handling rtl.de --- R/deliver_rtl_de.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/R/deliver_rtl_de.R b/R/deliver_rtl_de.R index 0c92eb3..0121096 100644 --- a/R/deliver_rtl_de.R +++ b/R/deliver_rtl_de.R @@ -9,6 +9,9 @@ pb_deliver_paper.rtl_de <- function(x, verbose = NULL, pb, ...) 
{ return(s_n_list()) } else { json_df <- jsonlite::fromJSON(json_txt[2]) + if (any(json_df$`@type` %in% c("VideoGame"))) { + return(s_n_list()) + } if (json_df$`@type` != "VideoObject") { # NewsArticle datetime <- lubridate::as_datetime(json_df$datePublished) headline <- json_df$headline From 3b4b3f97b587c050cee8685b0cbb370334f6468f Mon Sep 17 00:00:00 2001 From: schochastics <david@schochastics.net> Date: Thu, 24 Oct 2024 14:25:25 +0200 Subject: [PATCH 121/121] changed text scraping for spiegel.de --- R/deliver_spiegel_de.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/deliver_spiegel_de.R b/R/deliver_spiegel_de.R index 5a80679..17e7073 100644 --- a/R/deliver_spiegel_de.R +++ b/R/deliver_spiegel_de.R @@ -21,7 +21,7 @@ pb_deliver_paper.spiegel_de <- function(x, verbose = NULL, pb, ...) { # text text <- html %>% - rvest::html_elements("div[data-area = \"body\"]") %>% + rvest::html_elements("div[data-area = \"text\"]") %>% rvest::html_text2() %>% paste(collapse = "\n")