Skip to content

Commit

Permalink
Rewound evaluation link updates to include them in a separate PR
Browse files Browse the repository at this point in the history
  • Loading branch information
huangworld committed Nov 25, 2023
1 parent f8ac509 commit d4512b0
Show file tree
Hide file tree
Showing 16 changed files with 41 additions and 64 deletions.
2 changes: 1 addition & 1 deletion evaluation/benchmarks/bio/bio-align/genome-diff.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# bacteria), and any regions with less than 10 supporting reads.

# Requires: samtools, minimap2, bcftools
# Data: atlas-group.cs.brown.edu/data/bio/R1.fastq.gz atlas-group.cs.brown.edu/data/bio/R2.fastq.gz atlas-group.cs.brown.edu/data/bio/ref.fa
# Data: http://ndr.md/data/bio/R1.fastq.gz http://ndr.md/data/bio/R2.fastq.gz http://ndr.md/data/bio/ref.fa

# https://github.com/samtools/samtools/releases/latest
# https://github.com/lh3/minimap2
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/bio/bio-align/genquality.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# http://thegenomefactory.blogspot.com/2019/09/25-reasons-assemblies-dont-make-it-into.html

# Require: csvkit
# Data: atlas-group.cs.brown.edu/data/bio/genbank.txt
# Data: http://ndr.md/data/bio/genbank.txt

IN=./input/genbank.txt
OUT=./output/out.txt
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/bio/bio1/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ mkdir -p input
mkdir -p output
cd input
if [[ ! -f R1.fastq ]]; then
wget atlas-group.cs.brown.edu/data/bio/{R1.fastq.gz,R2.fastq.gz,ref.fa}
wget ndr.md/data/bio/{R1.fastq.gz,R2.fastq.gz,ref.fa}

gunzip R1.fastq.gz
gunzip R2.fastq.gz
Expand Down
4 changes: 2 additions & 2 deletions evaluation/benchmarks/max-temp/max-temp-preprocess.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#!/bin/bash

sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
sed 's;^;http://ndr.md/data/noaa/;' |
sed 's;$;/;' |
xargs -r -n 1 curl -s |
grep gz |
tr -s ' \n' |
cut -d ' ' -f9 |
sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' |
sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
sed 's;^;http://ndr.md/data/noaa/;' |
xargs -n1 curl -s |
gunzip
2 changes: 1 addition & 1 deletion evaluation/benchmarks/max-temp/max-temp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

FROM=${FROM:-2015}
TO=${TO:-2015}
IN=${IN:-'atlas-group.cs.brown.edu/data/noaa/'}
IN=${IN:-'http://ndr.md/data/noaa/'}
fetch=${fetch:-"curl -s"}

seq $FROM $TO |
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/max-temp/temp-analytics.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

FROM=${FROM:-2015}
TO=${TO:-2015}
IN=${IN:-'atlas-group.cs.brown.edu/data/noaa/'}
IN=${IN:-'http://ndr.md/data/noaa/'}
fetch=${fetch:-"curl -s"}

data_file=temperatures.txt
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/nlp/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ setup_dataset() {
cd pg
if [[ "$1" == "--full" ]]; then
echo 'N.b.: download/extraction will take about 10min'
wget atlas-group.cs.brown.edu/data/pg.tar.xz # FIXME: moving to PG soon
wget ndr.md/data/pg.tar.xz
if [ $? -ne 0 ]; then
cat <<-'EOF' | sed 's/^ *//'
Downloading input dataset failed, thus need to manually rsync all books from project gutenberg:
Expand Down
8 changes: 4 additions & 4 deletions evaluation/benchmarks/oneliners/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ setup_dataset() {
fi

if [ ! -f ./1M.txt ]; then
curl -sf 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt
curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
if [ $? -ne 0 ]; then
echo 'cannot find 1M.txt -- please contact the developers of pash'
exit 1
Expand All @@ -51,7 +51,7 @@ setup_dataset() {
fi

if [ ! -f ./1G.txt ]; then
curl -sf 'atlas-group.cs.brown.edu/data/dummy/1G.txt' > 1G.txt
curl -sf 'http://ndr.md/data/dummy/1G.txt' > 1G.txt
if [ $? -ne 0 ]; then
echo 'cannot find 1G.txt -- please contact the developers of pash'
exit 1
Expand All @@ -61,7 +61,7 @@ setup_dataset() {

# download wamerican-insane dictionary and sort according to machine
if [ ! -f ./dict.txt ]; then
curl -sf 'atlas-group.cs.brown.edu/data/dummy/dict.txt' | sort > dict.txt
curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > dict.txt
if [ $? -ne 0 ]; then
echo 'cannot find dict.txt -- please contact the developers of pash'
exit 1
Expand All @@ -70,7 +70,7 @@ setup_dataset() {
fi

if [ ! -f ./all_cmds.txt ]; then
curl -sf 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt
curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt
if [ $? -ne 0 ]; then
# This should be OK for tests, no need for abort
ls /usr/bin/* > all_cmds.txt
Expand Down
3 changes: 2 additions & 1 deletion evaluation/benchmarks/web-index/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ setup_dataset() {
wget $wiki_archive || eexit "cannot fetch wikipedia"
7za x wikipedia-en-html.tar.7z
tar -xvf wikipedia-en-html.tar
wget atlas-group.cs.brown.edu/data/wikipedia/index.txt # FIXME: we download index below?
wget http://ndr.md/data/wikipedia/index.txt # || eexit "cannot fetch wiki indices"
# It is actually OK if we don't have this index since we download the 500/1000 below
fi

if [ "$1" = "--small" ]; then
Expand Down
2 changes: 1 addition & 1 deletion evaluation/distr_benchmarks/nlp/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ if [ ! -e ./pg ]; then
cd pg
if [[ "$1" == "--full" ]]; then
echo 'N.b.: download/extraction will take about 10min'
wget atlas-group.cs.brown.edu/data/pg.tar.xz # FIXME: moving to PG soon
wget ndr.md/data/pg.tar.xz
if [ $? -ne 0 ]; then
cat <<-'EOF' | sed 's/^ *//'
Downloading input dataset failed, thus need to manually rsync all books from project gutenberg:
Expand Down
53 changes: 16 additions & 37 deletions evaluation/distr_benchmarks/oneliners/input/setup.sh
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
#!/bin/bash
#set -e

PASH_TOP=${PASH_TOP:-$DISH_TOP/pash}
. "$PASH_TOP/scripts/utils.sh"

PASH_TOP=${PASH_TOP:-$(git rev-parse --show-toplevel)}

# another solution for capturing HTTP status code
# https://superuser.com/a/590170
Expand All @@ -15,15 +13,14 @@ if [[ "$1" == "-c" ]]; then
exit
fi

hdfs dfs -mkdir -p /oneliners
hdfs dfs -mkdir /oneliners

if [ ! -f ./1M.txt ]; then
curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt
curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
if [ $? -ne 0 ]; then
curl -f 'https://zenodo.org/record/7650885/files/1M.txt' > 1M.txt
[ $? -ne 0 ] && eexit 'cannot find 1M.txt'
echo 'cannot find 1M.txt -- please contact the developers of pash'
exit 1
fi
append_nl_if_not ./1M.txt
fi

if [ ! -f ./10M.txt ]; then
Expand All @@ -41,53 +38,35 @@ if [ ! -f ./100M.txt ]; then
fi

if [ ! -f ./1G.txt ]; then
curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1G.txt' > 1G.txt
if [ $? -ne 0 ]; then
touch 1G.txt
for (( i = 0; i < 10; i++ )); do
cat 100M.txt >> 1G.txt
done
fi
fi

if [ ! -f ./words ]; then
curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/words' > words
if [ $? -ne 0 ]; then
curl -f 'https://zenodo.org/record/7650885/files/words' > words
curl -sf 'http://ndr.md/data/dummy/1G.txt' > 1G.txt
if [ $? -ne 0 ]; then
if [ $(uname) = 'Darwin' ]; then
cp /usr/share/dict/web2 words || eexit "cannot find dict file"
else
# apt install wamerican-insane
cp /usr/share/dict/words words || eexit "cannot find dict file"
fi
echo 'cannot find 1G.txt -- please contact the developers of pash'
exit 1
fi
fi
append_nl_if_not words
fi

# download wamerican-insane dictionary and sort according to machine
if [ ! -f ./dict.txt ]; then
curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/dict.txt' | sort > dict.txt
curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > dict.txt
if [ $? -ne 0 ]; then
sort words > sorted_words
echo 'cannot find dict.txt -- please contact the developers of pash'
exit 1
fi
fi

if [ ! -f ./all_cmds.txt ]; then
curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt
curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt
if [ $? -ne 0 ]; then
# This should be OK for tests, no need for abort
ls /usr/bin/* > all_cmds.txt
fi
append_nl_if_not ./all_cmds.txt
fi

if [ ! -f ./all_cmdsx100.txt ]; then
touch all_cmdsx100.txt
for (( i = 0; i < 100; i++ )); do
cat all_cmds.txt >> all_cmdsx100.txt
done
touch all_cmdsx100.txt
for (( i = 0; i < 100; i++ )); do
cat all_cmds.txt >> all_cmdsx100.txt
done
fi

if [ ! -f ./3G.txt ]; then
Expand Down
4 changes: 2 additions & 2 deletions evaluation/intro/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ cd $(dirname $0)
[ "$1" = "-c" ] && rm-files 100M.txt words sorted_words

if [ ! -f ./100M.txt ]; then
curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/100M.txt' > 100M.txt
curl -f 'ndr.md/data/dummy/100M.txt' > 100M.txt
if [ $? -ne 0 ]; then
curl -f 'http://www.gutenberg.org/files/2600/2600-0.txt' | head -c 1M > 1M.txt
[ $? -ne 0 ] && eexit 'cannot find 1M.txt'
Expand All @@ -20,7 +20,7 @@ if [ ! -f ./100M.txt ]; then
fi

if [ ! -f ./words ]; then
curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/words' > words
curl -f 'http://ndr.md/data/dummy/words' > words
if [ $? -ne 0 ]; then
if [ $(uname) = 'Darwin' ]; then
cp /usr/share/dict/web2 words || eexit "cannot find dict file"
Expand Down
2 changes: 1 addition & 1 deletion evaluation/other/more-scripts/page-count.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

# Require: libimage-exiftool-perl, bc
# Data:
# atlas-group.cs.brown.edu/data/large.pdf
# http://ndr.md/data/dummy/large.pdf
# More data:
# https://arxiv.org/help/bulk_data

Expand Down
2 changes: 1 addition & 1 deletion evaluation/other/more-scripts/spell.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# TODO: `groff is an interesting "pure", whose wrapper only needs split input
# TODO: files carefully.

# Data: atlas-group.cs.brown.edu/data/dummy/ronn.1
# Data: http://ndr.md/data/dummy/ronn.1
# dict depends on the system (and has to be sorted), so we assume it exists
dict=./input/dict.txt

Expand Down
9 changes: 3 additions & 6 deletions evaluation/tests/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ esac
[ "$1" = "-c" ] && rm-files 1M.txt all_cmds.txt words sorted_words 10M.txt

if [ ! -f ./1M.txt ]; then
curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/1M.txt' > 1M.txt
curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
if [ $? -ne 0 ]; then
curl -sf 'http://www.gutenberg.org/files/2600/2600-0.txt' | head -c 1${head_sz} > 1M.txt
[ $? -ne 0 ] && eexit 'cannot find 1M.txt'
Expand All @@ -26,18 +26,15 @@ fi

if [ ! -f ./all_cmds.txt ]; then
if [ "$(hostname)" = "deathstar" ]; then
curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt
if [ $? -ne 0 ]; then
curl -f 'https://zenodo.org/record/7650885/files/all_cmds.txt' > all_cmds.txt || eexit "all_cmds not found"
fi
curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt || eexit "all_cmds not found"
else
ls /usr/bin/* > all_cmds.txt
fi
append_nl_if_not ./all_cmds.txt
fi

if [ ! -f ./words ]; then
curl -sf --connect-timeout 10 'atlas-group.cs.brown.edu/data/dummy/words' > words
curl -sf 'http://ndr.md/data/dummy/words' > words
if [ $? -ne 0 ]; then
if [ $(uname) = 'Darwin' ]; then
cp /usr/share/dict/web2 words || eexit "cannot find dict file"
Expand Down
6 changes: 3 additions & 3 deletions evaluation/tests/sed-test.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
cat $PASH_TOP/evaluation/tests/input/1M.txt |
sed 's;^d;da;' |
sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
sed 's;^;http://ndr.md/data/noaa/;' |
sed 's;$;/;' |
sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' |
sed 's;^;atlas-group.cs.brown.edu/data/noaa/;' |
sed 's;^;http://ndr.md/data/noaa/;' |
sed "s#^#$WIKI#" |
sed s/\$/'0s'/ |
sed 1d |
sed 4d |
sed "\$d"
sed "\$d"

0 comments on commit d4512b0

Please sign in to comment.