Skip to content

Commit

Permalink
Merge pull request #24 from schochastics/german_media
Browse files Browse the repository at this point in the history
add german media (#23)
  • Loading branch information
JBGruber authored Dec 26, 2024
2 parents cdcf4d5 + 3b4b3f9 commit 3234c0c
Show file tree
Hide file tree
Showing 101 changed files with 3,152 additions and 6 deletions.
17 changes: 11 additions & 6 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,16 @@ Title: Comprehensive Collection of News Media Scrapers
Version: 0.0.7.9000
Date: 2024-07-17
Authors@R:
person(given = "Johannes B.",
family = "Gruber",
email = "[email protected]",
role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-9177-1772"))
c(person(given = "Johannes B.",
family = "Gruber",
email = "[email protected]",
role = c("aut", "cre"),
comment = c(ORCID = "0000-0001-9177-1772")),
person(given = "David",
family = "Schoch",
email = "[email protected]",
role = "ctb",
comment = c(ORCID = "0000-0003-2952-4812")))
Description: A comprehensive collection of webscraping scripts for news media sites.
Depends:
R (>= 3.5.0)
Expand Down Expand Up @@ -44,6 +49,6 @@ Suggests:
URL: https://github.com/JBGruber/paperboy
Encoding: UTF-8
BugReports: https://github.com/JBGruber/paperboy/issues
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
VignetteBuilder: knitr
Language: en-GB
98 changes: 98 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,25 @@
S3method(pb_deliver,character)
S3method(pb_deliver,data.frame)
S3method(pb_deliver,default)
S3method(pb_deliver_paper,"3sat_de")
S3method(pb_deliver_paper,abendblatt_de)
S3method(pb_deliver_paper,abendzeitung_muenchen_de)
S3method(pb_deliver_paper,ac24_cz)
S3method(pb_deliver_paper,ad_nl)
S3method(pb_deliver_paper,aktualne_cz)
S3method(pb_deliver_paper,anotherangryvoice_blogspot_com)
S3method(pb_deliver_paper,augsburger_allgemeine_de)
S3method(pb_deliver_paper,badische_zeitung_de)
S3method(pb_deliver_paper,bbc_co_uk)
S3method(pb_deliver_paper,berliner_kurier_de)
S3method(pb_deliver_paper,berliner_zeitung_de)
S3method(pb_deliver_paper,bild_de)
S3method(pb_deliver_paper,blesk_cz)
S3method(pb_deliver_paper,bnn_de)
S3method(pb_deliver_paper,br_de)
S3method(pb_deliver_paper,breakingnews_ie)
S3method(pb_deliver_paper,breitbart_com)
S3method(pb_deliver_paper,businessinsider_de)
S3method(pb_deliver_paper,buzzfeed_com)
S3method(pb_deliver_paper,cbsnews_com)
S3method(pb_deliver_paper,ceskatelevize_cz)
Expand All @@ -19,52 +30,139 @@ S3method(pb_deliver_paper,cnn_com)
S3method(pb_deliver_paper,dailymail_co_uk)
S3method(pb_deliver_paper,default)
S3method(pb_deliver_paper,denikn_cz)
S3method(pb_deliver_paper,der_postillon_com)
S3method(pb_deliver_paper,derstandard_at)
S3method(pb_deliver_paper,derwesten_de)
S3method(pb_deliver_paper,deutschlandfunk_de)
S3method(pb_deliver_paper,deutschlandfunkkultur_de)
S3method(pb_deliver_paper,dnn_de)
S3method(pb_deliver_paper,echo24_de)
S3method(pb_deliver_paper,epochtimes_de)
S3method(pb_deliver_paper,evolvepolitics_com)
S3method(pb_deliver_paper,express_de)
S3method(pb_deliver_paper,faz_net)
S3method(pb_deliver_paper,finanzen_net)
S3method(pb_deliver_paper,fnp_de)
S3method(pb_deliver_paper,focus_de)
S3method(pb_deliver_paper,forbes_com)
S3method(pb_deliver_paper,foxbusiness_com)
S3method(pb_deliver_paper,fr_de)
S3method(pb_deliver_paper,freiepresse_de)
S3method(pb_deliver_paper,geenstijl_nl)
S3method(pb_deliver_paper,handelsblatt_com)
S3method(pb_deliver_paper,haz_de)
S3method(pb_deliver_paper,heidelberg24_de)
S3method(pb_deliver_paper,heise_de)
S3method(pb_deliver_paper,hn_cz)
S3method(pb_deliver_paper,hna_de)
S3method(pb_deliver_paper,huffpost_com)
S3method(pb_deliver_paper,idnes_cz)
S3method(pb_deliver_paper,independent_co_uk)
S3method(pb_deliver_paper,independent_ie)
S3method(pb_deliver_paper,infranken_de)
S3method(pb_deliver_paper,irishexaminer_com)
S3method(pb_deliver_paper,irishmirror_ie)
S3method(pb_deliver_paper,irishtimes_com)
S3method(pb_deliver_paper,irozhlas_cz)
S3method(pb_deliver_paper,joe_ie)
S3method(pb_deliver_paper,jungefreiheit_de)
S3method(pb_deliver_paper,kabeleins_de)
S3method(pb_deliver_paper,karlsruhe_insider_de)
S3method(pb_deliver_paper,kreiszeitung_de)
S3method(pb_deliver_paper,ksta_de)
S3method(pb_deliver_paper,kurier_at)
S3method(pb_deliver_paper,latimes_com)
S3method(pb_deliver_paper,lidovky_cz)
S3method(pb_deliver_paper,lvz_de)
S3method(pb_deliver_paper,manager_magazin_de)
S3method(pb_deliver_paper,marketwatch_com)
S3method(pb_deliver_paper,maz_online_de)
S3method(pb_deliver_paper,mdr_de)
S3method(pb_deliver_paper,mediacourant_nl)
S3method(pb_deliver_paper,merkur_de)
S3method(pb_deliver_paper,metronieuws_nl)
S3method(pb_deliver_paper,mopo_de)
S3method(pb_deliver_paper,morgenpost_de)
S3method(pb_deliver_paper,n_tv_de)
S3method(pb_deliver_paper,name_de)
S3method(pb_deliver_paper,ndr_de)
S3method(pb_deliver_paper,news_de)
S3method(pb_deliver_paper,news_und_nachrichten_de)
S3method(pb_deliver_paper,newsflash24_de)
S3method(pb_deliver_paper,newstatesman_com)
S3method(pb_deliver_paper,newsweek_com)
S3method(pb_deliver_paper,nordkurier_de)
S3method(pb_deliver_paper,nos_nl)
S3method(pb_deliver_paper,novinky_cz)
S3method(pb_deliver_paper,noz_de)
S3method(pb_deliver_paper,nrc_nl)
S3method(pb_deliver_paper,nu_nl)
S3method(pb_deliver_paper,nw_de)
S3method(pb_deliver_paper,nypost_com)
S3method(pb_deliver_paper,nytimes_com)
S3method(pb_deliver_paper,nzz_ch)
S3method(pb_deliver_paper,orf_at)
S3method(pb_deliver_paper,ostsee_zeitung_de)
S3method(pb_deliver_paper,parlamentnilisty_cz)
S3method(pb_deliver_paper,presseportal_de)
S3method(pb_deliver_paper,prosieben_de)
S3method(pb_deliver_paper,rbb24_de)
S3method(pb_deliver_paper,rnd_de)
S3method(pb_deliver_paper,rollingstone_de)
S3method(pb_deliver_paper,rp_online_de)
S3method(pb_deliver_paper,rte_ie)
S3method(pb_deliver_paper,rtl_de)
S3method(pb_deliver_paper,rtl_nl)
S3method(pb_deliver_paper,ruhr24_de)
S3method(pb_deliver_paper,ruhrnachrichten_de)
S3method(pb_deliver_paper,saechsische_de)
S3method(pb_deliver_paper,schwaebische_de)
S3method(pb_deliver_paper,seznamzpravy_cz)
S3method(pb_deliver_paper,sfgate_com)
S3method(pb_deliver_paper,shz_de)
S3method(pb_deliver_paper,skwawkbox_org)
S3method(pb_deliver_paper,sky_com)
S3method(pb_deliver_paper,spiegel_de)
S3method(pb_deliver_paper,srf_ch)
S3method(pb_deliver_paper,stern_de)
S3method(pb_deliver_paper,stuttgarter_zeitung_de)
S3method(pb_deliver_paper,sueddeutsche_de)
S3method(pb_deliver_paper,suedkurier_de)
S3method(pb_deliver_paper,swp_de)
S3method(pb_deliver_paper,swr3_de)
S3method(pb_deliver_paper,swr_de)
S3method(pb_deliver_paper,swrfernsehen_de)
S3method(pb_deliver_paper,t3n_de)
S3method(pb_deliver_paper,t_online_de)
S3method(pb_deliver_paper,tag24_de)
S3method(pb_deliver_paper,tagesschau_de)
S3method(pb_deliver_paper,tagesspiegel_de)
S3method(pb_deliver_paper,taz_de)
S3method(pb_deliver_paper,telegraaf_nl)
S3method(pb_deliver_paper,telegraph_co_uk)
S3method(pb_deliver_paper,thecanary_co)
S3method(pb_deliver_paper,theguardian_com)
S3method(pb_deliver_paper,thejournal_ie)
S3method(pb_deliver_paper,thesun_ie)
S3method(pb_deliver_paper,thueringer_allgemeine_de)
S3method(pb_deliver_paper,tz_de)
S3method(pb_deliver_paper,usatoday_com)
S3method(pb_deliver_paper,vice_com)
S3method(pb_deliver_paper,volkskrant_nl)
S3method(pb_deliver_paper,volksstimme_de)
S3method(pb_deliver_paper,vox_de)
S3method(pb_deliver_paper,wa_de)
S3method(pb_deliver_paper,washingtonpost_com)
S3method(pb_deliver_paper,watson_ch)
S3method(pb_deliver_paper,watson_de)
S3method(pb_deliver_paper,waz_de)
S3method(pb_deliver_paper,wdr_de)
S3method(pb_deliver_paper,welt_de)
S3method(pb_deliver_paper,wiwo_de)
S3method(pb_deliver_paper,wsj_com)
S3method(pb_deliver_paper,wz_de)
S3method(pb_deliver_paper,yahoo_com)
S3method(pb_deliver_paper,zdf_de)
S3method(pb_deliver_paper,zeit_de)
export("%>%")
export(pb_available)
Expand Down
28 changes: 28 additions & 0 deletions R/deliver_3sat_de.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Parser for articles from 3sat.de.
#
# x:       a row/list whose `content_raw` element holds the raw html.
# verbose: forwarded to pb_tick().
# pb:      forwarded to pb_tick() (presumably a progress bar -- confirm).
# ...:     unused here.
#
# Returns whatever s_n_list() builds from datetime, author, headline, text.
#' @export
pb_deliver_paper.3sat_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # the raw html lives in the content_raw column
  html <- rvest::read_html(x$content_raw)

  # every <time> element contributes its datetime attribute
  time_nodes <- rvest::html_elements(html, "time")
  datetime <- lubridate::as_datetime(rvest::html_attr(time_nodes, "datetime"))

  headline_nodes <- rvest::html_elements(html, ".main-content-details h2")
  headline <- rvest::html_text(headline_nodes)

  # no author info found
  author <- ""

  # paragraphs are joined into one newline-separated string
  paragraph_nodes <- rvest::html_elements(html, ".o--post-long p")
  text <- paste(rvest::html_text2(paragraph_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
30 changes: 30 additions & 0 deletions R/deliver_abendblatt_de.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Parser for articles from abendblatt.de.
#
# x:       a row/list whose `content_raw` element holds the raw html.
# verbose: forwarded to pb_tick().
# pb:      forwarded to pb_tick() (presumably a progress bar -- confirm).
# ...:     unused here.
#
# Metadata (publication date, headline, authors) is read from the SECOND
# JSON-LD <script> block of the page; the body text comes from the
# .article-body headings and paragraphs. Pages that do not provide a second
# JSON-LD block yield an empty s_n_list() instead of failing in fromJSON().
#' @export
pb_deliver_paper.abendblatt_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()

  # json_txt[2] is used below, so at least two usable entries are required;
  # checking only for length 0 would pass NA to fromJSON() on pages with a
  # single JSON-LD block.
  if (length(json_txt) < 2 || is.na(json_txt[2])) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[2])

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  # several authors are collapsed into one comma-separated string
  author <- toString(json_df$author$name)
  text <- html %>%
    rvest::html_elements(".article-body h3, .article-body p") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
# The RSS feed includes subpages, which cannot be parsed.
# The RSS feed also includes podcasts, which cannot be parsed either.
28 changes: 28 additions & 0 deletions R/deliver_abendzeitung_muenchen_de.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Parser for articles from abendzeitung-muenchen.de.
#
# x:       a row/list whose `content_raw` element holds the raw html.
# verbose: forwarded to pb_tick().
# pb:      forwarded to pb_tick() (presumably a progress bar -- confirm).
# ...:     unused here.
#
# Metadata is taken from the first JSON-LD <script> block; when the page has
# none, an empty s_n_list() is returned.
#' @export
pb_deliver_paper.abendzeitung_muenchen_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # the raw html lives in the content_raw column
  html <- rvest::read_html(x$content_raw)

  json_nodes <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ")
  json_txt <- rvest::html_text(json_nodes)

  # nothing to parse without JSON-LD metadata
  if (length(json_txt) == 0 || isTRUE(is.na(json_txt))) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[1])

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  # several authors are collapsed into one comma-separated string
  author <- toString(json_df$author$name)

  body_nodes <- rvest::html_elements(
    html,
    ".artdetail_short ,.artdetail_text p,.artdetail_text h2"
  )
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
31 changes: 31 additions & 0 deletions R/deliver_augsburger_allgemeine.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Parser for articles from augsburger-allgemeine.de.
#
# x:       a row/list whose `content_raw` element holds the raw html.
# verbose: forwarded to pb_tick().
# pb:      forwarded to pb_tick() (presumably a progress bar -- confirm).
# ...:     unused here.
#
# All fields are scraped directly from the html (no JSON-LD is used).
#' @export
pb_deliver_paper.augsburger_allgemeine_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # the raw html lives in the content_raw column
  html <- rvest::read_html(x$content_raw)

  # only the first <time> element is consulted
  time_node <- rvest::html_element(html, "time")
  datetime <- lubridate::as_datetime(rvest::html_attr(time_node, "datetime"))

  headline_node <- rvest::html_element(
    html,
    "h2.typo-teaserheadline-SoleXL, h2.typo-articleheadline-Recife"
  )
  headline <- rvest::html_text(headline_node)

  # several author links are collapsed into one comma-separated string
  author_nodes <- rvest::html_elements(html, "a.typo-author-link")
  author <- toString(rvest::html_text2(author_nodes))

  text_nodes <- rvest::html_elements(
    html,
    ".typo-article-teaser-Recife, .typo-article-teaser, .article-body-paid-content, .typo-subhead, p.text-xs"
  )
  # teaser might be duplicated, hence unique()
  text <- paste(unique(rvest::html_text2(text_nodes)), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
28 changes: 28 additions & 0 deletions R/deliver_badische_zeitung_de.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Parser for articles from badische-zeitung.de.
#
# x:       a row/list whose `content_raw` element holds the raw html.
# verbose: forwarded to pb_tick().
# pb:      forwarded to pb_tick() (presumably a progress bar -- confirm).
# ...:     unused here.
#
# Metadata is taken from the first JSON-LD <script> block; when the page has
# none, an empty s_n_list() is returned.
#' @export
pb_deliver_paper.badische_zeitung_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # content_raw is converted from ISO-8859-1 to UTF-8 before it is parsed
  utf8_raw <- iconv(x$content_raw, from = "ISO-8859-1", to = "UTF-8")
  html <- rvest::read_html(utf8_raw)

  json_nodes <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ")
  json_txt <- rvest::html_text(json_nodes)

  # nothing to parse without JSON-LD metadata
  if (length(json_txt) == 0 || isTRUE(is.na(json_txt))) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[1])

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  # the author field is used as-is here (not its $name element)
  author <- toString(json_df$author)

  body_nodes <- rvest::html_elements(
    html,
    "section[role = \"article\"], .article-site__topic"
  )
  text <- paste(rvest::html_text2(body_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
24 changes: 24 additions & 0 deletions R/deliver_berliner_kurier_de.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Parser for articles from berliner-kurier.de.
#
# x:       a row/list whose `content_raw` element holds the raw html.
# verbose: forwarded to pb_tick().
# pb:      forwarded to pb_tick() (presumably a progress bar -- confirm).
# ...:     unused here.
#
# Metadata (publication date, headline, authors) is read from the first
# JSON-LD <script> block. Pages without such a block yield an empty
# s_n_list(), matching the behaviour of the other JSON-LD based parsers,
# instead of passing unusable input to fromJSON().
#' @export
pb_deliver_paper.berliner_kurier_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # raw html is stored in column content_raw
  html <- rvest::read_html(x$content_raw)

  json_txt <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ") %>%
    rvest::html_text()

  # guard against pages that carry no JSON-LD metadata
  if (isTRUE(is.na(json_txt)) || length(json_txt) == 0) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[1])

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  # several authors are collapsed into one comma-separated string
  author <- toString(json_df$author$name)
  text <- html %>%
    rvest::html_elements(".article_header-lead__0E3Bn, p.article_paragraph__hXYKJ, h2.article_subtitle__wx1Lu") %>%
    rvest::html_text2() %>%
    paste(collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
28 changes: 28 additions & 0 deletions R/deliver_berliner_zeitung_de.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Parser for articles from berliner-zeitung.de.
#
# x:       a row/list whose `content_raw` element holds the raw html.
# verbose: forwarded to pb_tick().
# pb:      forwarded to pb_tick() (presumably a progress bar -- confirm).
# ...:     unused here.
#
# Metadata is taken from the first JSON-LD <script> block; when the page has
# none, an empty s_n_list() is returned.
#' @export
pb_deliver_paper.berliner_zeitung_de <- function(x, verbose = NULL, pb, ...) {
  pb_tick(x, verbose, pb)
  # the raw html lives in the content_raw column
  html <- rvest::read_html(x$content_raw)

  json_nodes <- rvest::html_elements(html, "script[type = \"application/ld+json\"] ")
  json_txt <- rvest::html_text(json_nodes)

  # nothing to parse without JSON-LD metadata
  if (length(json_txt) == 0 || isTRUE(is.na(json_txt))) {
    return(s_n_list())
  }

  json_df <- jsonlite::fromJSON(json_txt[1])

  datetime <- lubridate::as_datetime(json_df$datePublished)
  headline <- json_df$headline
  # several authors are collapsed into one comma-separated string
  author <- toString(json_df$author$name)

  paragraph_nodes <- rvest::html_elements(html, ".article_paragraph__hXYKJ")
  text <- paste(rvest::html_text2(paragraph_nodes), collapse = "\n")

  s_n_list(
    datetime,
    author,
    headline,
    text
  )
}
Loading

0 comments on commit 3234c0c

Please sign in to comment.