From d4512b0cf9684a36774e317a65e39a15fd1d8214 Mon Sep 17 00:00:00 2001 From: Zhicheng Huang Date: Sat, 25 Nov 2023 17:38:45 -0500 Subject: [PATCH] Rewound evaluation link updates to include in a separate PR --- .../benchmarks/bio/bio-align/genome-diff.sh | 2 +- .../benchmarks/bio/bio-align/genquality.sh | 2 +- evaluation/benchmarks/bio/bio1/setup.sh | 2 +- .../max-temp/max-temp-preprocess.sh | 4 +- evaluation/benchmarks/max-temp/max-temp.sh | 2 +- .../benchmarks/max-temp/temp-analytics.sh | 2 +- evaluation/benchmarks/nlp/input/setup.sh | 2 +- .../benchmarks/oneliners/input/setup.sh | 8 +-- .../benchmarks/web-index/input/setup.sh | 3 +- .../distr_benchmarks/nlp/input/setup.sh | 2 +- .../distr_benchmarks/oneliners/input/setup.sh | 53 ++++++------------- evaluation/intro/input/setup.sh | 4 +- evaluation/other/more-scripts/page-count.sh | 2 +- evaluation/other/more-scripts/spell.sh | 2 +- evaluation/tests/input/setup.sh | 9 ++-- evaluation/tests/sed-test.sh | 6 +-- 16 files changed, 41 insertions(+), 64 deletions(-) diff --git a/evaluation/benchmarks/bio/bio-align/genome-diff.sh b/evaluation/benchmarks/bio/bio-align/genome-diff.sh index c82061797..a269f9e95 100755 --- a/evaluation/benchmarks/bio/bio-align/genome-diff.sh +++ b/evaluation/benchmarks/bio/bio-align/genome-diff.sh @@ -11,7 +11,7 @@ # bacteria), and any regions with less than 10 supporting reads. 
# Requires: samtools, minimap2, bcftools -# Data: atlas-group.cs.brown.edu/data/bio/R1.fastq.gz atlas-group.cs.brown.edu/data/bio/R2.fastq.gz atlas-group.cs.brown.edu/data/bio/ref.fa +# Data: http://ndr.md/data/bio/R1.fastq.gz http://ndr.md/data/bio/R2.fastq.gz http://ndr.md/data/bio/ref.fa # https://github.com/samtools/samtools/releases/latest # https://github.com/lh3/minimap2 diff --git a/evaluation/benchmarks/bio/bio-align/genquality.sh b/evaluation/benchmarks/bio/bio-align/genquality.sh index 62c731960..64c777fdd 100755 --- a/evaluation/benchmarks/bio/bio-align/genquality.sh +++ b/evaluation/benchmarks/bio/bio-align/genquality.sh @@ -6,7 +6,7 @@ # http://thegenomefactory.blogspot.com/2019/09/25-reasons-assemblies-dont-make-it-into.html # Require: csvkit -# Data: atlas-group.cs.brown.edu/data/bio/genbank.txt +# Data: http://ndr.md/data/bio/genbank.txt IN=./input/genbank.txt OUT=./output/out.txt diff --git a/evaluation/benchmarks/bio/bio1/setup.sh b/evaluation/benchmarks/bio/bio1/setup.sh index 9c2bb1629..40bdd47cb 100644 --- a/evaluation/benchmarks/bio/bio1/setup.sh +++ b/evaluation/benchmarks/bio/bio1/setup.sh @@ -8,7 +8,7 @@ mkdir -p input mkdir -p output cd input if [[ ! 
-f R1.fastq ]]; then - wget atlas-group.cs.brown.edu/data/bio/{R1.fastq.gz,R2.fastq.gz,ref.fa} + wget ndr.md/data/bio/{R1.fastq.gz,R2.fastq.gz,ref.fa} gunzip R1.fastq.gz gunzip R2.fastq.gz diff --git a/evaluation/benchmarks/max-temp/max-temp-preprocess.sh b/evaluation/benchmarks/max-temp/max-temp-preprocess.sh index 8d0719049..e3d4b98c5 100755 --- a/evaluation/benchmarks/max-temp/max-temp-preprocess.sh +++ b/evaluation/benchmarks/max-temp/max-temp-preprocess.sh @@ -1,12 +1,12 @@ #!/bin/bash -sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' | +sed 's;^;http://ndr.md/data/noaa/;' | sed 's;$;/;' | xargs -r -n 1 curl -s | grep gz | tr -s ' \n' | cut -d ' ' -f9 | sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' | - sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' | + sed 's;^;http://ndr.md/data/noaa/;' | xargs -n1 curl -s | gunzip diff --git a/evaluation/benchmarks/max-temp/max-temp.sh b/evaluation/benchmarks/max-temp/max-temp.sh index b74f72b10..b0c18aaa8 100755 --- a/evaluation/benchmarks/max-temp/max-temp.sh +++ b/evaluation/benchmarks/max-temp/max-temp.sh @@ -2,7 +2,7 @@ FROM=${FROM:-2015} TO=${TO:-2015} -IN=${IN:-'atlas-group.cs.brown.edu/data/noaa/'} +IN=${IN:-'http://ndr.md/data/noaa/'} fetch=${fetch:-"curl -s"} seq $FROM $TO | diff --git a/evaluation/benchmarks/max-temp/temp-analytics.sh b/evaluation/benchmarks/max-temp/temp-analytics.sh index a1399fa7d..319a8f0e4 100755 --- a/evaluation/benchmarks/max-temp/temp-analytics.sh +++ b/evaluation/benchmarks/max-temp/temp-analytics.sh @@ -2,7 +2,7 @@ FROM=${FROM:-2015} TO=${TO:-2015} -IN=${IN:-'atlas-group.cs.brown.edu/data/noaa/'} +IN=${IN:-'http://ndr.md/data/noaa/'} fetch=${fetch:-"curl -s"} data_file=temperatures.txt diff --git a/evaluation/benchmarks/nlp/input/setup.sh b/evaluation/benchmarks/nlp/input/setup.sh index a26a9cf19..5486b39f2 100755 --- a/evaluation/benchmarks/nlp/input/setup.sh +++ b/evaluation/benchmarks/nlp/input/setup.sh @@ -20,7 +20,7 @@ setup_dataset() { cd pg if [[ "$1" == "--full" ]]; then echo 
'N.b.: download/extraction will take about 10min' - wget atlas-group.cs.brown.edu/data/pg.tar.xz # FIXME: moving to PG soon + wget ndr.md/data/pg.tar.xz if [ $? -ne 0 ]; then cat <<-'EOF' | sed 's/^ *//' Downloading input dataset failed, thus need to manually rsync all books from project gutenberg: diff --git a/evaluation/benchmarks/oneliners/input/setup.sh b/evaluation/benchmarks/oneliners/input/setup.sh index eb8a00317..96388980d 100755 --- a/evaluation/benchmarks/oneliners/input/setup.sh +++ b/evaluation/benchmarks/oneliners/input/setup.sh @@ -26,7 +26,7 @@ setup_dataset() { fi if [ ! -f ./1M.txt ]; then - curl -sf 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt + curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt if [ $? -ne 0 ]; then echo 'cannot find 1M.txt -- please contact the developers of pash' exit 1 @@ -51,7 +51,7 @@ setup_dataset() { fi if [ ! -f ./1G.txt ]; then - curl -sf 'atlas-group.cs.brown.edu/data/dummy/1G.txt' > 1G.txt + curl -sf 'http://ndr.md/data/dummy/1G.txt' > 1G.txt if [ $? -ne 0 ]; then echo 'cannot find 1G.txt -- please contact the developers of pash' exit 1 @@ -61,7 +61,7 @@ setup_dataset() { # download wamerican-insane dictionary and sort according to machine if [ ! -f ./dict.txt ]; then - curl -sf 'atlas-group.cs.brown.edu/data/dummy/dict.txt' | sort > dict.txt + curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > dict.txt if [ $? -ne 0 ]; then echo 'cannot find dict.txt -- please contact the developers of pash' exit 1 @@ -70,7 +70,7 @@ setup_dataset() { fi if [ ! -f ./all_cmds.txt ]; then - curl -sf 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt + curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt if [ $? 
-ne 0 ]; then # This should be OK for tests, no need for abort ls /usr/bin/* > all_cmds.txt diff --git a/evaluation/benchmarks/web-index/input/setup.sh b/evaluation/benchmarks/web-index/input/setup.sh index 79a77276a..72a4fd8f9 100755 --- a/evaluation/benchmarks/web-index/input/setup.sh +++ b/evaluation/benchmarks/web-index/input/setup.sh @@ -17,7 +17,8 @@ setup_dataset() { wget $wiki_archive || eexit "cannot fetch wikipedia" 7za x wikipedia-en-html.tar.7z tar -xvf wikipedia-en-html.tar - wget atlas-group.cs.brown.edu/data/wikipedia/index.txt # FIXME: we download index below? + wget http://ndr.md/data/wikipedia/index.txt # || eexit "cannot fetch wiki indices" + # It is actually OK if we don't have this index since we download the 500/1000 below fi if [ "$1" = "--small" ]; then diff --git a/evaluation/distr_benchmarks/nlp/input/setup.sh b/evaluation/distr_benchmarks/nlp/input/setup.sh index 8f9daa05d..e523d21a8 100755 --- a/evaluation/distr_benchmarks/nlp/input/setup.sh +++ b/evaluation/distr_benchmarks/nlp/input/setup.sh @@ -19,7 +19,7 @@ if [ ! -e ./pg ]; then cd pg if [[ "$1" == "--full" ]]; then echo 'N.b.: download/extraction will take about 10min' - wget atlas-group.cs.brown.edu/data/pg.tar.xz # FIXME: moving to PG soon + wget ndr.md/data/pg.tar.xz if [ $? -ne 0 ]; then cat <<-'EOF' | sed 's/^ *//' Downloading input dataset failed, thus need to manually rsync all books from project gutenberg: diff --git a/evaluation/distr_benchmarks/oneliners/input/setup.sh b/evaluation/distr_benchmarks/oneliners/input/setup.sh index c9078d477..a24725912 100755 --- a/evaluation/distr_benchmarks/oneliners/input/setup.sh +++ b/evaluation/distr_benchmarks/oneliners/input/setup.sh @@ -1,9 +1,7 @@ #!/bin/bash #set -e -PASH_TOP=${PASH_TOP:-$DISH_TOP/pash} -. 
"$PASH_TOP/scripts/utils.sh" - +PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)} # another solution for capturing HTTP status code # https://superuser.com/a/590170 @@ -15,15 +13,14 @@ if [[ "$1" == "-c" ]]; then exit fi -hdfs dfs -mkdir -p /oneliners +hdfs dfs -mkdir /oneliners if [ ! -f ./1M.txt ]; then - curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt + curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt if [ $? -ne 0 ]; then - curl -f 'https://zenodo.org/record/7650885/files/1M.txt' > 1M.txt - [ $? -ne 0 ] && eexit 'cannot find 1M.txt' + echo 'cannot find 1M.txt -- please contact the developers of pash' + exit 1 fi - append_nl_if_not ./1M.txt fi if [ ! -f ./10M.txt ]; then @@ -41,53 +38,35 @@ if [ ! -f ./100M.txt ]; then fi if [ ! -f ./1G.txt ]; then - curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1G.txt' > 1G.txt - if [ $? -ne 0 ]; then - touch 1G.txt - for (( i = 0; i < 10; i++ )); do - cat 100M.txt >> 1G.txt - done - fi -fi - -if [ ! -f ./words ]; then - curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/words' > words - if [ $? -ne 0 ]; then - curl -f 'https://zenodo.org/record/7650885/files/words' > words + curl -sf 'http://ndr.md/data/dummy/1G.txt' > 1G.txt if [ $? -ne 0 ]; then - if [ $(uname) = 'Darwin' ]; then - cp /usr/share/dict/web2 words || eexit "cannot find dict file" - else - # apt install wamerican-insane - cp /usr/share/dict/words words || eexit "cannot find dict file" - fi + echo 'cannot find 1G.txt -- please contact the developers of pash' + exit 1 fi - fi - append_nl_if_not words fi # download wamerican-insane dictionary and sort according to machine if [ ! -f ./dict.txt ]; then - curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/dict.txt' | sort > dict.txt + curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > dict.txt if [ $? 
-ne 0 ]; then - sort words > sorted_words + echo 'cannot find dict.txt -- please contact the developers of pash' + exit 1 fi fi if [ ! -f ./all_cmds.txt ]; then - curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt + curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt if [ $? -ne 0 ]; then # This should be OK for tests, no need for abort ls /usr/bin/* > all_cmds.txt fi - append_nl_if_not ./all_cmds.txt fi if [ ! -f ./all_cmdsx100.txt ]; then - touch all_cmdsx100.txt - for (( i = 0; i < 100; i++ )); do - cat all_cmds.txt >> all_cmdsx100.txt - done + touch all_cmdsx100.txt + for (( i = 0; i < 100; i++ )); do + cat all_cmds.txt >> all_cmdsx100.txt + done fi if [ ! -f ./3G.txt ]; then diff --git a/evaluation/intro/input/setup.sh b/evaluation/intro/input/setup.sh index e1c253dd7..a524e9e56 100755 --- a/evaluation/intro/input/setup.sh +++ b/evaluation/intro/input/setup.sh @@ -7,7 +7,7 @@ cd $(dirname $0) [ "$1" = "-c" ] && rm-files 100M.txt words sorted_words if [ ! -f ./100M.txt ]; then - curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/100M.txt' > 100M.txt + curl -f 'ndr.md/data/dummy/100M.txt' > 100M.txt if [ $? -ne 0 ]; then curl -f 'http://www.gutenberg.org/files/2600/2600-0.txt' | head -c 1M > 1M.txt [ $? -ne 0 ] && eexit 'cannot find 1M.txt' @@ -20,7 +20,7 @@ if [ ! -f ./100M.txt ]; then fi if [ ! -f ./words ]; then - curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/words' > words + curl -f 'http://ndr.md/data/dummy/words' > words if [ $? 
-ne 0 ]; then if [ $(uname) = 'Darwin' ]; then cp /usr/share/dict/web2 words || eexit "cannot find dict file" diff --git a/evaluation/other/more-scripts/page-count.sh b/evaluation/other/more-scripts/page-count.sh index c4d89ecfd..b4a3326e5 100755 --- a/evaluation/other/more-scripts/page-count.sh +++ b/evaluation/other/more-scripts/page-count.sh @@ -5,7 +5,7 @@ # Require: libimage-exiftool-perl, bc # Data: -# atlas-group.cs.brown.edu/data/large.pdf +# http://ndr.md/data/dummy/large.pdf # More data: # https://arxiv.org/help/bulk_data diff --git a/evaluation/other/more-scripts/spell.sh b/evaluation/other/more-scripts/spell.sh index 9fd5e7384..1d4a9f330 100755 --- a/evaluation/other/more-scripts/spell.sh +++ b/evaluation/other/more-scripts/spell.sh @@ -6,7 +6,7 @@ # TODO: `groff is an interesting "pure", whose wrapper only needs split input # TODO: files carefully. -# Data: atlas-group.cs.brown.edu/data/dummy/ronn.1 +# Data: http://ndr.md/data/dummy/ronn.1 # dict depends on the system (and has to be sorted), so we assume it exists dict=./input/dict.txt diff --git a/evaluation/tests/input/setup.sh b/evaluation/tests/input/setup.sh index 88b332f1c..ac78afd20 100755 --- a/evaluation/tests/input/setup.sh +++ b/evaluation/tests/input/setup.sh @@ -16,7 +16,7 @@ esac [ "$1" = "-c" ] && rm-files 1M.txt all_cmds.txt words sorted_words 10M.txt if [ ! -f ./1M.txt ]; then - curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt + curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt if [ $? -ne 0 ]; then curl -sf 'http://www.gutenberg.org/files/2600/2600-0.txt' | head -c 1${head_sz} > 1M.txt [ $? -ne 0 ] && eexit 'cannot find 1M.txt' @@ -26,10 +26,7 @@ fi if [ ! -f ./all_cmds.txt ]; then if [ "$(hostname)" = "deathstar" ]; then - curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt - if [ $? 
-ne 0 ]; then - curl -f 'https://zenodo.org/record/7650885/files/all_cmds.txt' > all_cmds.txt || eexit "all_cmds not found" - fi + curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt || eexit "all_cmds not found" else ls /usr/bin/* > all_cmds.txt fi @@ -37,7 +34,7 @@ if [ ! -f ./all_cmds.txt ]; then fi if [ ! -f ./words ]; then - curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/words' > words + curl -sf 'http://ndr.md/data/dummy/words' > words if [ $? -ne 0 ]; then if [ $(uname) = 'Darwin' ]; then cp /usr/share/dict/web2 words || eexit "cannot find dict file" diff --git a/evaluation/tests/sed-test.sh b/evaluation/tests/sed-test.sh index 38d1cc855..f5ba0ac85 100644 --- a/evaluation/tests/sed-test.sh +++ b/evaluation/tests/sed-test.sh @@ -1,11 +1,11 @@ cat $PASH_TOP/evaluation/tests/input/1M.txt | sed 's;^d;da;' | - sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' | + sed 's;^;http://ndr.md/data/noaa/;' | sed 's;$;/;' | sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' | - sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' | + sed 's;^;http://ndr.md/data/noaa/;' | sed "s#^#$WIKI#" | sed s/\$/'0s'/ | sed 1d | sed 4d | - sed "\$d" + sed "\$d" \ No newline at end of file