Skip to content

Commit

Permalink
Rewound evaluation link updates to include them in a separate PR
Browse files Browse the repository at this point in the history
  • Loading branch information
huangworld committed Nov 25, 2023
1 parent f8ac509 commit d4512b0
Show file tree
Hide file tree
Showing 16 changed files with 41 additions and 64 deletions.
2 changes: 1 addition & 1 deletion evaluation/benchmarks/bio/bio-align/genome-diff.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# bacteria), and any regions with less than 10 supporting reads.

# Requires: samtools, minimap2, bcftools
# Data: atlas-group.cs.brown.edu/data/bio/R1.fastq.gz atlas-group.cs.brown.edu/data/bio/R2.fastq.gz atlas-group.cs.brown.edu/data/bio/ref.fa
# Data: http://ndr.md/data/bio/R1.fastq.gz http://ndr.md/data/bio/R2.fastq.gz http://ndr.md/data/bio/ref.fa

# https://github.com/samtools/samtools/releases/latest
# https://github.com/lh3/minimap2
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/bio/bio-align/genquality.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# http://thegenomefactory.blogspot.com/2019/09/25-reasons-assemblies-dont-make-it-into.html

# Require: csvkit
# Data: atlas-group.cs.brown.edu/data/bio/genbank.txt
# Data: http://ndr.md/data/bio/genbank.txt

IN=./input/genbank.txt
OUT=./output/out.txt
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/bio/bio1/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ mkdir -p input
mkdir -p output
cd input
if [[ ! -f R1.fastq ]]; then
wget atlas-group.cs.brown.edu/data/bio/{R1.fastq.gz,R2.fastq.gz,ref.fa}
wget ndr.md/data/bio/{R1.fastq.gz,R2.fastq.gz,ref.fa}

gunzip R1.fastq.gz
gunzip R2.fastq.gz
Expand Down
4 changes: 2 additions & 2 deletions evaluation/benchmarks/max-temp/max-temp-preprocess.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#!/bin/bash

sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
sed 's;^;http://ndr.md/data/noaa/;' |
sed 's;$;/;' |
xargs -r -n 1 curl -s |
grep gz |
tr -s ' \n' |
cut -d ' ' -f9 |
sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' |
sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
sed 's;^;http://ndr.md/data/noaa/;' |
xargs -n1 curl -s |
gunzip
2 changes: 1 addition & 1 deletion evaluation/benchmarks/max-temp/max-temp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

FROM=${FROM:-2015}
TO=${TO:-2015}
IN=${IN:-'atlas-group.cs.brown.edu/data/noaa/'}
IN=${IN:-'http://ndr.md/data/noaa/'}
fetch=${fetch:-"curl -s"}

seq $FROM $TO |
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/max-temp/temp-analytics.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

FROM=${FROM:-2015}
TO=${TO:-2015}
IN=${IN:-'atlas-group.cs.brown.edu/data/noaa/'}
IN=${IN:-'http://ndr.md/data/noaa/'}
fetch=${fetch:-"curl -s"}

data_file=temperatures.txt
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/nlp/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ setup_dataset() {
cd pg
if [[ "$1" == "--full" ]]; then
echo 'N.b.: download/extraction will take about 10min'
wget atlas-group.cs.brown.edu/data/pg.tar.xz # FIXME: moving to PG soon
wget ndr.md/data/pg.tar.xz
if [ $? -ne 0 ]; then
cat <<-'EOF' | sed 's/^ *//'
Downloading input dataset failed, thus need to manually rsync all books from project gutenberg:
Expand Down
8 changes: 4 additions & 4 deletions evaluation/benchmarks/oneliners/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ setup_dataset() {
fi

if [ ! -f ./1M.txt ]; then
curl -sf 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt
curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
if [ $? -ne 0 ]; then
echo 'cannot find 1M.txt -- please contact the developers of pash'
exit 1
Expand All @@ -51,7 +51,7 @@ setup_dataset() {
fi

if [ ! -f ./1G.txt ]; then
curl -sf 'atlas-group.cs.brown.edu/data/dummy/1G.txt' > 1G.txt
curl -sf 'http://ndr.md/data/dummy/1G.txt' > 1G.txt
if [ $? -ne 0 ]; then
echo 'cannot find 1G.txt -- please contact the developers of pash'
exit 1
Expand All @@ -61,7 +61,7 @@ setup_dataset() {

# download wamerican-insane dictionary and sort according to machine
if [ ! -f ./dict.txt ]; then
curl -sf 'atlas-group.cs.brown.edu/data/dummy/dict.txt' | sort > dict.txt
curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > dict.txt
if [ $? -ne 0 ]; then
echo 'cannot find dict.txt -- please contact the developers of pash'
exit 1
Expand All @@ -70,7 +70,7 @@ setup_dataset() {
fi

if [ ! -f ./all_cmds.txt ]; then
curl -sf 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt
curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt
if [ $? -ne 0 ]; then
# This should be OK for tests, no need for abort
ls /usr/bin/* > all_cmds.txt
Expand Down
3 changes: 2 additions & 1 deletion evaluation/benchmarks/web-index/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ setup_dataset() {
wget $wiki_archive || eexit "cannot fetch wikipedia"
7za x wikipedia-en-html.tar.7z
tar -xvf wikipedia-en-html.tar
wget atlas-group.cs.brown.edu/data/wikipedia/index.txt # FIXME: we download index below?
wget http://ndr.md/data/wikipedia/index.txt # || eexit "cannot fetch wiki indices"
# It is actually OK if we don't have this index since we download the 500/1000 below
fi

if [ "$1" = "--small" ]; then
Expand Down
2 changes: 1 addition & 1 deletion evaluation/distr_benchmarks/nlp/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ if [ ! -e ./pg ]; then
cd pg
if [[ "$1" == "--full" ]]; then
echo 'N.b.: download/extraction will take about 10min'
wget atlas-group.cs.brown.edu/data/pg.tar.xz # FIXME: moving to PG soon
wget ndr.md/data/pg.tar.xz
if [ $? -ne 0 ]; then
cat <<-'EOF' | sed 's/^ *//'
Downloading input dataset failed, thus need to manually rsync all books from project gutenberg:
Expand Down
53 changes: 16 additions & 37 deletions evaluation/distr_benchmarks/oneliners/input/setup.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
#!/bin/bash
#set -e

PASH_TOP=${PASH_TOP:-$DISH_TOP/pash}
. "$PASH_TOP/scripts/utils.sh"

PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)}

# another solution for capturing HTTP status code
# https://superuser.com/a/590170
Expand All @@ -15,15 +13,14 @@ if [[ "$1" == "-c" ]]; then
exit
fi

hdfs dfs -mkdir -p /oneliners
hdfs dfs -mkdir /oneliners

if [ ! -f ./1M.txt ]; then
curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt
curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
if [ $? -ne 0 ]; then
curl -f 'https://zenodo.org/record/7650885/files/1M.txt' > 1M.txt
[ $? -ne 0 ] && eexit 'cannot find 1M.txt'
echo 'cannot find 1M.txt -- please contact the developers of pash'
exit 1
fi
append_nl_if_not ./1M.txt
fi

if [ ! -f ./10M.txt ]; then
Expand All @@ -41,53 +38,35 @@ if [ ! -f ./100M.txt ]; then
fi

if [ ! -f ./1G.txt ]; then
curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1G.txt' > 1G.txt
if [ $? -ne 0 ]; then
touch 1G.txt
for (( i = 0; i < 10; i++ )); do
cat 100M.txt >> 1G.txt
done
fi
fi

if [ ! -f ./words ]; then
curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/words' > words
if [ $? -ne 0 ]; then
curl -f 'https://zenodo.org/record/7650885/files/words' > words
curl -sf 'http://ndr.md/data/dummy/1G.txt' > 1G.txt
if [ $? -ne 0 ]; then
if [ $(uname) = 'Darwin' ]; then
cp /usr/share/dict/web2 words || eexit "cannot find dict file"
else
# apt install wamerican-insane
cp /usr/share/dict/words words || eexit "cannot find dict file"
fi
echo 'cannot find 1G.txt -- please contact the developers of pash'
exit 1
fi
fi
append_nl_if_not words
fi

# download wamerican-insane dictionary and sort according to machine
if [ ! -f ./dict.txt ]; then
curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/dict.txt' | sort > dict.txt
curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > dict.txt
if [ $? -ne 0 ]; then
sort words > sorted_words
echo 'cannot find dict.txt -- please contact the developers of pash'
exit 1
fi
fi

if [ ! -f ./all_cmds.txt ]; then
curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt
curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt
if [ $? -ne 0 ]; then
# This should be OK for tests, no need for abort
ls /usr/bin/* > all_cmds.txt
fi
append_nl_if_not ./all_cmds.txt
fi

if [ ! -f ./all_cmdsx100.txt ]; then
touch all_cmdsx100.txt
for (( i = 0; i < 100; i++ )); do
cat all_cmds.txt >> all_cmdsx100.txt
done
touch all_cmdsx100.txt
for (( i = 0; i < 100; i++ )); do
cat all_cmds.txt >> all_cmdsx100.txt
done
fi

if [ ! -f ./3G.txt ]; then
Expand Down
4 changes: 2 additions & 2 deletions evaluation/intro/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ cd $(dirname $0)
[ "$1" = "-c" ] && rm-files 100M.txt words sorted_words

if [ ! -f ./100M.txt ]; then
curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/100M.txt' > 100M.txt
curl -f 'ndr.md/data/dummy/100M.txt' > 100M.txt
if [ $? -ne 0 ]; then
curl -f 'http://www.gutenberg.org/files/2600/2600-0.txt' | head -c 1M > 1M.txt
[ $? -ne 0 ] && eexit 'cannot find 1M.txt'
Expand All @@ -20,7 +20,7 @@ if [ ! -f ./100M.txt ]; then
fi

if [ ! -f ./words ]; then
curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/words' > words
curl -f 'http://ndr.md/data/dummy/words' > words
if [ $? -ne 0 ]; then
if [ $(uname) = 'Darwin' ]; then
cp /usr/share/dict/web2 words || eexit "cannot find dict file"
Expand Down
2 changes: 1 addition & 1 deletion evaluation/other/more-scripts/page-count.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

# Require: libimage-exiftool-perl, bc
# Data:
# atlas-group.cs.brown.edu/data/large.pdf
# http://ndr.md/data/dummy/large.pdf
# More data:
# https://arxiv.org/help/bulk_data

Expand Down
2 changes: 1 addition & 1 deletion evaluation/other/more-scripts/spell.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# TODO: `groff is an interesting "pure", whose wrapper only needs split input
# TODO: files carefully.

# Data: atlas-group.cs.brown.edu/data/dummy/ronn.1
# Data: http://ndr.md/data/dummy/ronn.1
# dict depends on the system (and has to be sorted), so we assume it exists
dict=./input/dict.txt

Expand Down
9 changes: 3 additions & 6 deletions evaluation/tests/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ esac
[ "$1" = "-c" ] && rm-files 1M.txt all_cmds.txt words sorted_words 10M.txt

if [ ! -f ./1M.txt ]; then
curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt
curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
if [ $? -ne 0 ]; then
curl -sf 'http://www.gutenberg.org/files/2600/2600-0.txt' | head -c 1${head_sz} > 1M.txt
[ $? -ne 0 ] && eexit 'cannot find 1M.txt'
Expand All @@ -26,18 +26,15 @@ fi

if [ ! -f ./all_cmds.txt ]; then
if [ "$(hostname)" = "deathstar" ]; then
curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt
if [ $? -ne 0 ]; then
curl -f 'https://zenodo.org/record/7650885/files/all_cmds.txt' > all_cmds.txt || eexit "all_cmds not found"
fi
curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt || eexit "all_cmds not found"
else
ls /usr/bin/* > all_cmds.txt
fi
append_nl_if_not ./all_cmds.txt
fi

if [ ! -f ./words ]; then
curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/words' > words
curl -sf 'http://ndr.md/data/dummy/words' > words
if [ $? -ne 0 ]; then
if [ $(uname) = 'Darwin' ]; then
cp /usr/share/dict/web2 words || eexit "cannot find dict file"
Expand Down
6 changes: 3 additions & 3 deletions evaluation/tests/sed-test.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
cat $PASH_TOP/evaluation/tests/input/1M.txt |
sed 's;^d;da;' |
sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
sed 's;^;http://ndr.md/data/noaa/;' |
sed 's;$;/;' |
sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' |
sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
sed 's;^;http://ndr.md/data/noaa/;' |
sed "s#^#$WIKI#" |
sed s/\$/'0s'/ |
sed 1d |
sed 4d |
sed "\$d"
sed "\$d"

0 comments on commit d4512b0

Please sign in to comment.