# =====================================================
# General Parsers, not specific to Mediathek-Archives #
# =====================================================
# ---------------
# Parser "empty":
# ---------------
# "empty" means something like "after the implicit get, do a NOP".
# The parser definition is empty and only meant for testing.
# This can make sense for checking whether the implicit get works
# (i.e. whether the start-url works at all).
# It can also be used together with the "-v" option to get all
# sorts of information without starting any download actions
# and without invoking any other screen-filling parsers like "dump".
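# A quick reachability check of a start-url could then look like
# this (sketch; <url> is a placeholder):
#   $ any-dl -v -p empty <url>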
parsername "empty": ()
start
dummy;
end
parsername "linkextract": ( )
start
linkextract;
print;
end
# seconds since the Unix epoch ( seconds since 00 h 00 min 00 s, 01.01.1970 )
# ----------------------------------------------------------------------------
parsername "unix-time": ( )
start
recall("NOW");
print;
end
parsername "linkextract-uniq": ( )
start
linkextract;
uniq;
print;
end
parsername "linkextract-uniq-sorted": ( )
start
linkextract;
uniq;
sort;
print;
end
parsername "linkextract2csv": ( )
start
linkextract;
to_matchres;
csv_save_as("links.csv");
print;
end
parsername "linkextract_xml": ( )
start
linkextract_xml;
print;
end
# html-deparse-dump
parsername "dump": ( )
start
dump;
end
# show tags
# ---------
parsername "show_tags": ( )
start
show_tags;
end
# show tags with complete html-path
# ---------------------------------
parsername "show_tags_full": ( )
start
show_tags_fullpath;
end
# show URL and title
# -------------------
parsername "pageinfo": ( )
start
print_string("\n"); # blank line
titleextract;
print;
print( " ", $STARTURL, "\n" );
end
# show URL and title - print as markdown link
# -------------------------------------------
# To be used as:
# $ any-dl -sep "" -p pageinfo.md <url.1> <url.2> ... <url.n>
parsername "pageinfo.md": ( )
start
titleextract;
store("title");
print( "[", $title, "](", $STARTURL, ")" );
end
# show URL and title, html-decoded
# -------------------
parsername "pageinfo2": ( )
start
print_string("\n"); # blank line
titleextract;
htmldecode;
print;
print( " ", $STARTURL, "\n" );
end
# show URL and title, html-decoded
# -------------------
# Htmldecode is done before the title extraction.
# Therefore the title is also converted correctly to UTF-8 if
# other encodings have been found.
# This seems to be the correct pageinfo-parser,
# which would replace "pageinfo" and "pageinfo2",
# but for lack of time to test it intensively,
# "pageinfo" and "pageinfo2" will not be removed for now.
# Maybe "pageinfo3" will later replace "pageinfo",
# and the other pageinfo-parsers will be removed then.
# -------------------
parsername "pageinfo3": ( )
start
print_string("\n"); # blank line
htmldecode;
titleextract;
print;
print( " ", $STARTURL, "\n" );
end
# -----------------------------------------------------------
# Pageinfo as HTML-Link
# -----------------------------------------------------------
# use with CLI-switch -sep=""
# -----------------------------------------------------------
# You need to add the html-beginning and html-end yourself,
# but this parser creates the content in between,
# if used on many URLs on the CLI.
# -----------------------------------------------------------
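# One possible way to wrap the output into a complete HTML page
# (sketch; assumes a POSIX shell, the echo lines and the redirection
# are not part of any-dl itself):
#   $ ( echo "<html><body>"; \
#       any-dl -sep="" -p pageinfo_html <url.1> <url.2> ... <url.n>; \
#       echo "</body></html>" ) > linklist.html
# -----------------------------------------------------------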
parsername "pageinfo_html": ( )
start
print_string("<hr><p>");
htmldecode;
titleextract;
store("TITLE");
print_string("\n"); # blank line
print(>>><a href="<<<, $STARTURL, >>>"><<<, $TITLE, "</a>");
end
# ------------------------------------------
# Only prints the data part of the html, no tags.
# Acts like html-2-txt, but does not replace
# tags with their text equivalent. So, the name html2txt
# would be overstating its functionality.
# ------------------------------------------
parsername "dumptext": ( )
start
dump_data;
end
# ------------------------------------------
# same as dumptext, but with html-decode
# ------------------------------------------
parsername "dumptext2": ( )
start
htmldecode;
dump_data;
end
# show video-files
parsername "videonames": ( )
start
match( "(.*mp4)|(.*flv)|(.*swf)|(.*mpg)|(.*m4v)|(.*mp4)|(.*rtmp)|(.*f4v)|(.*3gp)|(.*mkv)" );
uniq;
print;
end
# show video-files
parsername "videolinks": ( )
start
tagselect( "a"."href" | argpairs );
grep("mkv"); # here match on all filenames would be needed
#match( "(.*mp4)|(.*flv)|(.*swf)|(.*mpg)|(.*m4v)|(.*mp4)|(.*rtmp)|(.*f4v)|(.*3gp)|(.*mkv)" );
print;
#match( "(.*mp4)|(.*flv)|(.*swf)|(.*mpg)|(.*m4v)|(.*mp4)|(.*rtmp)|(.*f4v)|(.*3gp)|(.*mkv)" );
uniq;
print;
end
# downloading picture-files
parsername "pics": ( )
start
linkextract;
print;
grep("((\.jpg)|(\.jpeg)|(\.gif)|(\.png)|(\.svg)|(\.pnm)|(\.ppm)|(\.pgm)|(\.pbm))$");
uniq;
print( " ======================================================================== ");
print;
show_type;
download; # directly writing files to disk
end
# downloads the img-src
# ---------------------
parsername "get-img-src": ( )
start
tagselect( "img" | arg("src") );
rebase;
makeurl;
download;
end
# get pdf-docs
# ------------
parsername "pdfs": ( )
start
#match( >>>"(http[^"]+\.pdf)<<< );
linkextract;
grep( >>>(.*?\.pdf)<<< );
uniq;
print;
to_matchres;
dropcol(1); # referrer removed!
show_match;
makeurl;
show_type;
print;
download; # directly writing files to disk
end
# =================================
# mp3-files matched and downloaded =
# =================================
parsername "mp3": ( )
start
linkextract;
grep("\.mp3");
makeurl;
download;
end
# ------------------------
# Show all script-sources
# ------------------------
# ( shows js-file-urls )
# ------------------------
parsername "script-src": ( )
start
tagselect( "script" | arg( "src" ) );
rebase;
show_match;
#exitparse;
makeurl;
download; # directly writing files to disk
end
# ------------------------
# Download all source-src
# ------------------------
parsername "source-src": ( )
start
tagselect( "source" | arg( "src" ) );
rebase;
show_match;
makeurl;
download; # directly writing files to disk
end
# ------------------------
# Show all meta-tags
# ------------------------
parsername "show-meta": ( )
start
tagselect( "meta" | dump );
end
# ------------------------
# Save all meta-tags
# ------------------------
parsername "save-meta": ( )
start
tagselect( "meta" | htmlstring );
to_string;
store("META");
paste($STARTURL);
subst("[[:^alnum:]]+", "_"); # non-alnum -> "_"
store("BASENAME");
paste($BASENAME, ".meta");
store("FILENAME");
recall("META");
save_as( $FILENAME );
end
# ---------------------------------------------
# Show all name-attributes of meta-tags
# ---------------------------------------------
# This can be used to extract all meta-names
# from a webpage, for example to make it easier to
# create scraperJSON-Files (ContentMine).
# ---------------------------------------------
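# A possible invocation, redirecting the sorted list to a file
# (sketch; <url> and the filename are placeholders):
#   $ any-dl -p show-meta-names <url> > meta-names.txt
# ---------------------------------------------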
parsername "show-meta-names": ( )
start
tagselect( "meta"."name" | arg("name") );
transpose; uniq; transpose;
sort;
print("Meta-Tags (alphabetically sorted) from site ", $STARTURL, "\n");
show_match;
end
# ---------------------------------------------
# show all argkey-combinations of meta-tags
# ---------------------------------------------
parsername "show-meta-argkeys": ( )
start
tagselect( "meta"."name" | argkeys );
uniq;
sort;
print("Meta-Tag argkey-combinations: (alphabetically sorted) from site ", $STARTURL, "\n");
show_match;
end
# --------------------------------------------------------------
# This parser creates the output for a Gnome-Desktop-File.
# --------------------------------------------------------------
# (what you get on your desktop when you drag & drop
# the URL-line from your browser (e.g. Firefox) onto
# the desktop).
# --------------------------------------------------------------
# It creates the output and prints it to stdout; it does not
# directly create the file.
# --------------------------------------------------------------
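# Since the output only goes to stdout, the desktop file itself can
# be created via shell redirection (sketch; the filename is a
# placeholder; the -sep="" switch mentioned elsewhere in this file
# may be useful to suppress extra separator output):
#   $ any-dl -p desktop <url> > mylink.desktop
# --------------------------------------------------------------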
parsername "desktop": ()
start
print( "[Desktop Entry]" );
print( "Encoding=UTF-8" );
titleextract;
store("title");
print("Name=", $title);
print("Type=Link");
print("URL=", $STARTURL);
print("Icon=gnome-fs-bookmark");
end
parsername "desktop2": ()
start
print( "[Desktop Entry]" );
print( "Encoding=UTF-8" );
titleextract;
print;
htmldecode;
print;
store("title");
print("Name=", $title);
print("Type=Link");
print("URL=", $STARTURL);
print("Icon=gnome-fs-bookmark");
end
# ----------------------------------------------------
# parser for testing cookies
# ----------------------------------------------------
# Some tests of cookies can be found here:
# http://winware.org/de/cookietest.php
#
# Used testpage for setting/reflecting a cookie:
# http://winware.org/de/cookietest.php?cookie=set
# ----------------------------------------------------
parsername "cookieecho": ( )
start
call("cookieecho");
end
# ---------------------------------------------
# parser for testing cookies, using a testpage,
# so that cookies are guaranteed to be set.
# The URL on the command line will be ignored!
# ---------------------------------------------
parsername "cookieecho-forced": ( )
start
print("The URL provided via command line will be ignored.");
print("The test-url http://winware.org/de/cookietest.php?cookie=set will be used instead!\n");
paste("http://winware.org/de/cookietest.php?cookie=set"); # use this test-url to check the COOKIE-functionality!
makeurl;
store("STARTURL");
get; # use this test-url (INSTEAD of the one given on the command line) to check the COOKIE-functionality!
call("cookieecho");
end
# ----------------------------------------------------
# ----------------------------------------------------
# ----------------------------------------------------
parsername "cookietest": ( "http://winware.org/de/cookietest.php" )
start
# the received cookies will not be sent back - hence no effect
# -------------------------------------------------------------
get("http://winware.org/de/cookietest.php");
get("http://winware.org/de/cookietest.php?cookie=set");
get("http://winware.org/de/cookietest.php");
get("http://winware.org/de/cookietest.php?cookie=set");
# now with using cookies (by copying the cookies from RECEIVED to SEND):
# ----------------------------------------------------------------------
recall("COOKIES.RECEIVED");
store("COOKIES.SEND");
get("http://winware.org/de/cookietest.php?cookie=set");
end
# ==============================================
# Parser-Definitions for the Mediathek-Parsers #
# ==============================================
# ------------------------------------------------------
# Extract the OSR shownotes and write them to a csv-file;
# also download the audio-file.
# With the -i switch, the audio-file-type can be selected.
# ------------------------------------------------------
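# A possible invocation, using the -i switch mentioned above to
# pick the audio-file-type interactively (sketch; the episode URL
# is a placeholder):
#   $ any-dl -i -p OSR-shownotes-2-csv <episode-url>
# ------------------------------------------------------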
parsername "OSR-shownotes-2-csv": ( "https://www.openscienceradio.org/" )
start
# extract title, for usage as part of the filename
call("_extract-and-store-title_");
recall("BASEDOC");
# html.body.div.div.div.div.article.div.div.p.a.target
tagselect("article", "div", "div", "p", "a" | arg("title"), arg("href") );
transpose;
# Html-decode first, then remove semicolons left over
# ----------------------------------------------------
htmldecode; # html-encoded stuff to UTF-8
subst(";", " -"); # remove semicolon, because of CSV-saving later on !!
subst(" - \(link\)", ""); # remove unnecessary stuff
#print;
show_match;
csv_save; # SAVING the show-notes as csv-file.
# extract the AUDIO-urls
# ----------------------
recall("BASEDOC");
tagselect("meta"."property"="og:audio" | arg("content") );
transpose;
# downloading the AUDIO-file now, and print some messages to the user.
# --------------------------------------------------------------------
iselectmatch(0, "http", "mp3");
makeurl;
print_string("\n\n-------------------------------------\n");
print_string("Start downloading the audio-file now.\nURL: ");
print;
print("...please be patient.");
download;
print("done.");
print_string("-------------------------------------\n");
end
# ---------------------------------
# Resonator
# ---------------------------------
# Downloads mp3 by default.
# With the -i switch on the command line,
# the user can select the preferred audio file.
# ------------------------------------------
parsername "resonator": ( "https://resonator-podcast.de", "http://resonator-podcast.de" )
start
# extract the audio-urls
# ----------------------
tagselect("meta"."property"="og:audio" | arg("content") );
transpose;
#show_match;
iselectmatch(0, "RES", "mp3");
# download and save
# -----------------
makeurl;
download; # directly writing files to disk
end
# ---------------------------------
# ARD
# ---------------------------------
# Example: https://www.ardmediathek.de/video/charite/folge-1-barmherzigkeit-s01-e01/mdr-fernsehen/Y3JpZDovL21kci5kZS9zZW5kdW5nLzI4MjA0MS8yMDI0MDIwNjAwMzUvbWRycGx1cy1zZW5kdW5nLTgwNjI
# =================================
parsername "ard_mediathek_get": ("https://www.ardmediathek.de" )
start
call("_extract-and-store-title_");
recall("BASEDOC");
tagselect("meta"."property"="contentUrl" | arg("content") );
makeurl;
download($title, ".mp4");
end
# ------------------------------------
# Wrapper-Parser for youtube and vimeo
# ====================================
parsername "youtube-and-vimeo": ( "https://www.youtube.com/", "https://vimeo.com/" )
start
paste(" yt-dlp ", $STARTURL );
system;
end
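# The paste/system pair above hands the start-url over to an
# external downloader; it is roughly equivalent to running
# (sketch; <STARTURL> is the URL given on the command line):
#   $ yt-dlp <STARTURL>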
# =================================
# Working, but too simple; you could use wget instead ;-) It's just for testing purposes.
parsername "save": ( )
start
save; # save original document
linkextract; # grab links from the document
print; # print links/referers
get; # get all documents to which the extracted links point
save; # save all the retrieved documents that were linked to via the main page
end
# =================================
# youtube Info
# =================================
# For a youtube-link directly to a
# video, print title, date and url.
# ---------------------------------
parsername "ytinfo": ( )
start
# Title
#recall("BASEDOC");
tagselect("meta"."property"="og:title" | arg("content"));
htmldecode;
to_string;
print;
# Date
recall("BASEDOC");
tagselect("meta"."itemprop"="datePublished" | arg("content") );
subst("T.*", ""); # remove time, only date needed (better readable without time)
store("datePublished");
print(" - ", $datePublished, " -");
# Video URL
print($STARTURL);
end
#
# vimscript-Function to insert the result of the ytinfo-parser into your current vim buffer:
#
#:function Youtubeinfo()
#: call inputsave()
#: let url = input(' Enter youtube-URL: ')
#: call inputrestore()
#: execute "r! any-dl -p ytinfo -sep=\"\" " . url
#:endfunction
# =====================================================
# M A C R O S
# =====================================================
# =========================================================
# Extract title-tag and substitute non-alnum chars by "_" =
# and store the result in variable "title".
# =========================================================
defmacro "_extract-and-store-title_":
start
titleextract;
subst("[[:^alnum:]]+", "_"); # non-alnum -> "_"
store("title");
end
defmacro "_extract-and-store-title-htmldecode_":
start
titleextract;
htmldecode;
subst("[[:^alnum:]]+", "_"); # non-alnum -> "_"
store("title");
end
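# Macros like these are invoked from a parser via call(); e.g.
# (sketch, lines taken from the parsers defined above):
#   call("_extract-and-store-title_");
#   ...
#   download($title, ".mp4");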
# ----------------------------------------------------
# Macro for testing cookies
# ----------------------------------------------------
# Some tests of cookies can be found here:
# http://winware.org/de/cookietest.php
#
# Used testpage for setting/reflecting a cookie:
# http://winware.org/de/cookietest.php?cookie=set
# ----------------------------------------------------
defmacro "cookieecho":
start
print("COOKIES.RECEIVED:\n", $COOKIES.RECEIVED );
print("Preparing cookies for get");
recall("COOKIES.RECEIVED");
store("COOKIES.SEND");
recall("STARTURL");
print;
get;
print("COOKIES.SEND:\n", $COOKIES.SEND );
print("COOKIES.RECEIVED:\n", $COOKIES.RECEIVED );
print("Preparing cookies for second get");
recall("COOKIES.RECEIVED");
store("COOKIES.SEND");
recall("STARTURL");
print;
get;
print("COOKIES.SEND:\n", $COOKIES.SEND );
print("COOKIES.RECEIVED:\n", $COOKIES.RECEIVED );
end
# -------------------------------------------------
# All fields of an <input> are listed (as columns)
# -------------------------------------------------
# Col 0, Col 2, Col 4, ... are fieldnames,
# Col 1, Col 3, Col 5, ... are fieldvalues
# -------------------------------------------------
# Use case: analyze a page for setting up a POST call
# -------------------------------------------------
parsername "showinput": ()
start
tagselect( "input" | argpairs );
show_match;
end
# show form's arg-pairs
# =====================
defmacro "showforms":
start
tagselect("form" | argpairs);
print;
end
# show input's arg-pairs
# ======================
defmacro "showinputs":
start
tagselect("input" | argpairs);
print;
end