Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update data URL #696

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion evaluation/benchmarks/bio/bio-align/genome-diff.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# bacteria), and any regions with less than 10 supporting reads.

# Requires: samtools, minimap2, bcftools
# Data: http://ndr.md/data/bio/R1.fastq.gz http://ndr.md/data/bio/R2.fastq.gz http://ndr.md/data/bio/ref.fa
# Data: atlas.cs.brown.edu/data/bio/R1.fastq.gz atlas.cs.brown.edu/data/bio/R2.fastq.gz atlas.cs.brown.edu/data/bio/ref.fa

# https://github.com/samtools/samtools/releases/latest
# https://github.com/lh3/minimap2
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/bio/bio-align/genquality.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# http://thegenomefactory.blogspot.com/2019/09/25-reasons-assemblies-dont-make-it-into.html

# Require: csvkit
# Data: http://ndr.md/data/bio/genbank.txt
# Data: atlas.cs.brown.edu/data/bio/genbank.txt

IN=./input/genbank.txt
OUT=./output/out.txt
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/bio/bio1/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ mkdir -p input
mkdir -p output
cd input
if [[ ! -f R1.fastq ]]; then
wget ndr.md/data/bio/{R1.fastq.gz,R2.fastq.gz,ref.fa}
wget atlas.cs.brown.edu/data/bio/{R1.fastq.gz,R2.fastq.gz,ref.fa}

gunzip R1.fastq.gz
gunzip R2.fastq.gz
Expand Down
4 changes: 2 additions & 2 deletions evaluation/benchmarks/max-temp/max-temp-preprocess.sh
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#!/bin/bash

sed 's;^;http://ndr.md/data/noaa/;' |
sed 's;^;atlas.cs.brown.edu/data/noaa/;' |
sed 's;$;/;' |
xargs -r -n 1 curl -s |
grep gz |
tr -s ' \n' |
cut -d ' ' -f9 |
sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' |
sed 's;^;http://ndr.md/data/noaa/;' |
sed 's;^;atlas.cs.brown.edu/data/noaa/;' |
xargs -n1 curl -s |
gunzip
2 changes: 1 addition & 1 deletion evaluation/benchmarks/max-temp/max-temp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

FROM=${FROM:-2015}
TO=${TO:-2015}
IN=${IN:-'http://ndr.md/data/noaa/'}
IN=${IN:-'atlas.cs.brown.edu/data/noaa/'}
fetch=${fetch:-"curl -s"}

seq $FROM $TO |
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/max-temp/temp-analytics.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

FROM=${FROM:-2015}
TO=${TO:-2015}
IN=${IN:-'http://ndr.md/data/noaa/'}
IN=${IN:-'atlas.cs.brown.edu/data/noaa/'}
fetch=${fetch:-"curl -s"}

data_file=temperatures.txt
Expand Down
2 changes: 1 addition & 1 deletion evaluation/benchmarks/nlp/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ setup_dataset() {
cd pg
if [[ "$1" == "--full" ]]; then
echo 'N.b.: download/extraction will take about 10min'
wget ndr.md/data/pg.tar.xz
wget atlas.cs.brown.edu/data/pg.tar.xz # FIXME: moving to PG soon
if [ $? -ne 0 ]; then
cat <<-'EOF' | sed 's/^ *//'
Downloading input dataset failed, thus need to manually rsync all books from project gutenberg:
Expand Down
8 changes: 4 additions & 4 deletions evaluation/benchmarks/oneliners/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ setup_dataset() {
fi

if [ ! -f ./1M.txt ]; then
curl -sf 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
curl -sf 'atlas.cs.brown.edu/data/dummy/1M.txt' > 1M.txt
if [ $? -ne 0 ]; then
echo 'cannot find 1M.txt -- please contact the developers of pash'
exit 1
Expand All @@ -51,7 +51,7 @@ setup_dataset() {
fi

if [ ! -f ./1G.txt ]; then
curl -sf 'http://ndr.md/data/dummy/1G.txt' > 1G.txt
curl -sf 'atlas.cs.brown.edu/data/dummy/1G.txt' > 1G.txt
if [ $? -ne 0 ]; then
echo 'cannot find 1G.txt -- please contact the developers of pash'
exit 1
Expand All @@ -61,7 +61,7 @@ setup_dataset() {

# download wamerican-insane dictionary and sort according to machine
if [ ! -f ./dict.txt ]; then
curl -sf 'http://ndr.md/data/dummy/dict.txt' | sort > dict.txt
curl -sf 'atlas.cs.brown.edu/data/dummy/dict.txt' | sort > dict.txt
if [ $? -ne 0 ]; then
echo 'cannot find dict.txt -- please contact the developers of pash'
exit 1
Expand All @@ -70,7 +70,7 @@ setup_dataset() {
fi

if [ ! -f ./all_cmds.txt ]; then
curl -sf 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt
curl -sf 'atlas.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt
if [ $? -ne 0 ]; then
# This should be OK for tests, no need for abort
ls /usr/bin/* > all_cmds.txt
Expand Down
3 changes: 1 addition & 2 deletions evaluation/benchmarks/web-index/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@ setup_dataset() {
wget $wiki_archive || eexit "cannot fetch wikipedia"
7za x wikipedia-en-html.tar.7z
tar -xvf wikipedia-en-html.tar
wget http://ndr.md/data/wikipedia/index.txt # || eexit "cannot fetch wiki indices"
# It is actually OK if we don't have this index since we download the 500/1000 below
wget atlas.cs.brown.edu/data/wikipedia/index.txt # FIXME: we download index below?
fi

if [ "$1" = "--small" ]; then
Expand Down
4 changes: 2 additions & 2 deletions evaluation/intro/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ cd $(dirname $0)


if [ ! -f ./100M.txt ]; then
curl -sf --connect-timeout 10 'ndr.md/data/dummy/100M.txt' > 100M.txt
curl -sf --connect-timeout 10 'atlas.cs.brown.edu/data/dummy/100M.txt' > 100M.txt
if [ $? -ne 0 ]; then
# Pipe curl through tac (twice) in order to consume all the output from curl.
# This way, curl can write the whole page and not emit an error code.
Expand All @@ -23,7 +23,7 @@ if [ ! -f ./100M.txt ]; then
fi

if [ ! -f ./words ]; then
curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/words' > words
curl -sf --connect-timeout 10 'atlas.cs.brown.edu/data/dummy/words' > words
if [ $? -ne 0 ]; then
curl -sf 'https://zenodo.org/record/7650885/files/words' > words
if [ $? -ne 0 ]; then
Expand Down
2 changes: 1 addition & 1 deletion evaluation/other/more-scripts/page-count.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

# Require: libimage-exiftool-perl, bc
# Data:
# http://ndr.md/data/dummy/large.pdf
# atlas.cs.brown.edu/data/dummy/large.pdf
# More data:
# https://arxiv.org/help/bulk_data

Expand Down
2 changes: 1 addition & 1 deletion evaluation/other/more-scripts/spell.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# TODO: `groff is an interesting "pure", whose wrapper only needs split input
# TODO: files carefully.

# Data: http://ndr.md/data/dummy/ronn.1
# Data: atlas.cs.brown.edu/data/dummy/ronn.1
# dict depends on the system (and has to be sorted), so we assume it exists
dict=./input/dict.txt

Expand Down
6 changes: 3 additions & 3 deletions evaluation/tests/input/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ esac
[ "$1" = "-c" ] && rm-files 1M.txt all_cmds.txt words sorted_words 10M.txt

if [ ! -f ./1M.txt ]; then
curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/1M.txt' > 1M.txt
curl -sf --connect-timeout 10 'atlas.cs.brown.edu/data/dummy/1M.txt' > 1M.txt
if [ $? -ne 0 ]; then
curl -f 'https://zenodo.org/record/7650885/files/1M.txt' > 1M.txt
if [ $? -ne 0 ]; then
Expand All @@ -29,7 +29,7 @@ fi

if [ ! -f ./all_cmds.txt ]; then
if [ "$(hostname)" = "deathstar" ]; then
curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/all_cmds.txt' > all_cmds.txt
curl -sf --connect-timeout 10 'atlas.cs.brown.edu/data/dummy/all_cmds.txt' > all_cmds.txt
if [ $? -ne 0 ]; then
curl -f 'https://zenodo.org/record/7650885/files/all_cmds.txt' > all_cmds.txt || eexit "all_cmds not found"
fi
Expand All @@ -40,7 +40,7 @@ if [ ! -f ./all_cmds.txt ]; then
fi

if [ ! -f ./words ]; then
curl -sf --connect-timeout 10 'http://ndr.md/data/dummy/words' > words
curl -sf --connect-timeout 10 'atlas.cs.brown.edu/data/dummy/words' > words
if [ $? -ne 0 ]; then
curl -f 'https://zenodo.org/record/7650885/files/words' > words
if [ $? -ne 0 ]; then
Expand Down
6 changes: 3 additions & 3 deletions evaluation/tests/sed-test.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
cat $PASH_TOP/evaluation/tests/input/1M.txt |
sed 's;^d;da;' |
sed 's;^;http://ndr.md/data/noaa/;' |
sed 's;^;atlas.cs.brown.edu/data/noaa/;' |
sed 's;$;/;' |
sed 's;^\(.*\)\(20[0-9][0-9]\).gz;\2/\1\2\.gz;' |
sed 's;^;http://ndr.md/data/noaa/;' |
sed 's;^;atlas.cs.brown.edu/data/noaa/;' |
sed "s#^#$WIKI#" |
sed s/\$/'0s'/ |
sed 1d |
sed 4d |
sed "\$d"
sed "\$d"
Loading