forked from mazko/jssnowball
-
Notifications
You must be signed in to change notification settings - Fork 0
/
configure
executable file
·69 lines (57 loc) · 4.76 KB
/
configure
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# Directory that receives the downloaded reference Java stemmer sources; it is
# also handed to make below as JAVA_SRC_DWNLD.
JAVA_SRC_DWNLOADED=java_snowball_dwnloaded

# Start from scratch: drop everything a previous run downloaded or unpacked,
# then let the Makefile remove its own build products.
rm -fr -- "$JAVA_SRC_DWNLOADED" snowball_code snowball_all
make clean

# fetch_and_unpack URL [DIR]
#   Download the tarball at URL into DIR (default: current directory), unpack
#   it inside DIR and remove the archive together with any "name*" siblings
#   (the same glob the script always used).
#   Returns non-zero when the download or the extraction fails.
#   The body is a subshell, so its variables do not leak into the script and
#   "exit" only leaves the function.
fetch_and_unpack() (
  url=$1
  dest=${2:-.}
  archive=$dest/${url##*/}
  wget -P "$dest" "$url" || exit 1
  tar xf "$archive" -C "$dest" || exit 1
  rm -- "$archive"*
)

# Every later step patches files inside these trees, so stop right away when
# one of them cannot be fetched instead of continuing on a missing tree.
fetch_and_unpack http://snowball.tartarus.org/dist/snowball_code.tgz &&
  fetch_and_unpack http://snowball.tartarus.org/dist/libstemmer_java.tgz "$JAVA_SRC_DWNLOADED" &&
  fetch_and_unpack http://snowball.tartarus.org/dist/snowball_all.tgz ||
  {
    echo "configure: failed to download or unpack the Snowball sources" >&2
    exit 1
  }

# Run the Makefile's java_src_check target for the listed algorithms, pointing
# it at the downloaded Java sources.
make JAVA_SRC_DWNLD="$JAVA_SRC_DWNLOADED" \
  libstemmer_algorithms="danish dutch english finnish french german hungarian italian norwegian porter portuguese romanian russian spanish swedish turkish" \
  java_src_check

# Overwrite the Snowball compiler's Java generator source with this project's
# generator_js.c (presumably so the compiler emits JavaScript - confirm
# against the Makefile).
cp js_snowball/generator_js.c snowball_code/compiler/generator_java.c
# basque - re-encode the vocabulary and apply a small patch to the .sbl source.
# The patch deals with the UTF-8 byte pair C3 B1, which is F1 in ISO-8859-1:
#   echo -n $'\xC3\xB1' | iconv -f utf8 -t ISO-8859-1 | xxd -p => f1
# Unpack the basque tarball next to itself (the later paths expect it to
# create a basque/ subdirectory).
tar xf snowball_all/algorithms/basque/tarball.tgz -C snowball_all/algorithms/basque
# Ask file(1) for the charset of the shipped voc.txt and convert it from that
# charset to UTF-8, writing the result one directory level up.
iconv -f `file -bi snowball_all/algorithms/basque/basque/voc.txt | sed 's!.*charset=\(.*\)!\1!'` -t utf-8 snowball_all/algorithms/basque/basque/voc.txt > snowball_all/algorithms/basque/voc.txt
# Install the stemmer source into the compiler tree as stem_Unicode.sbl.
mkdir snowball_code/algorithms/basque
cp snowball_all/algorithms/basque/basque/stemmer.sbl snowball_code/algorithms/basque/stem_Unicode.sbl
# The next three lines are one logical command line (joined by "; \"):
#   1. look up the line number of the
#      "/* special characters (in ISO Latin I) */" comment,
#   2. append the line   stringdef n" hex 'F1'   right after it,
#   3. run a second append after the same line - presumably this inserts an
#      empty line between the comment and the new stringdef; verify with the
#      installed sed.
# NOTE(review): if the marker comment is not found, line_begin is empty and
# the append commands run without an address, i.e. on every line.
line_begin=`sed -n '/\/\* special characters (in ISO Latin I) \*\//=' snowball_code/algorithms/basque/stem_Unicode.sbl`; \
sed -i "${line_begin}a\\stringdef n\" hex 'F1'" snowball_code/algorithms/basque/stem_Unicode.sbl; \
sed -i ${line_begin}'a\\' snowball_code/algorithms/basque/stem_Unicode.sbl
# Replace every occurrence of the bytes C3 B1 with the {n"} reference to the
# stringdef added above.
sed -i 's!\xc3\xb1!{n"}!g' snowball_code/algorithms/basque/stem_Unicode.sbl
# The unpacked tarball contents are no longer needed.
rm -fr snowball_all/algorithms/basque/basque
# armenian: build voc.txt from the aspell "hy" dictionary
# (debian: sudo apt-get install aspell-hy).
# Pipeline: dump the master word list, expand it, put one word per line,
# lower-case every character, sort and drop duplicates, then keep only
# lines 1, 4, 7, ... (every 3rd line).
aspell -l hy dump master |
  aspell -l hy expand |
  tr ' ' '\n' |
  sed -e 's/./\L\0/g' |
  sort -u |
  sed -n '1~3p' > snowball_all/algorithms/armenian/voc.txt
# Install the stemmer source from snowball_all into the compiler tree under
# the name stem_Unicode.sbl.
mkdir snowball_code/algorithms/armenian
cp snowball_all/algorithms/armenian/stemmer.sbl \
  snowball_code/algorithms/armenian/stem_Unicode.sbl
# catalan: build voc.txt from the aspell "ca" dictionary
# (debian: sudo apt-get install aspell-ca).
# Pipeline: dump the master word list, expand it, put one word per line,
# lower-case every character, sort and drop duplicates, then keep only
# lines 1, 201, 401, ... (every 200th line).
aspell -l ca dump master |
  aspell -l ca expand |
  tr ' ' '\n' |
  sed -e 's/./\L\0/g' |
  sort -u |
  sed -n '1~200p' > snowball_all/algorithms/catalan/voc.txt
# The stemmer source lives inside tarball.tgz: unpack it in place, copy
# stemmer.sbl into the compiler tree as stem_Unicode.sbl, then remove the
# unpacked directory again.
mkdir snowball_code/algorithms/catalan
tar xf snowball_all/algorithms/catalan/tarball.tgz \
  -C snowball_all/algorithms/catalan
cp snowball_all/algorithms/catalan/catalan/stemmer.sbl \
  snowball_code/algorithms/catalan/stem_Unicode.sbl
rm -fr snowball_all/algorithms/catalan/catalan
# czech - build voc.txt from the aspell "cs" dictionary (apt-get install
# aspell-cs), download the stemmer, and convert its ISO-8859-2 hex character
# codes to Unicode code points.
# Vocabulary pipeline: dump the master word list, expand it, one word per
# line, lower-case, sort + de-duplicate, keep every 75th line (1, 76, ...).
mkdir snowball_all/algorithms/czech
aspell -l cs dump master | aspell -l cs expand | tr ' ' '\n' | sed -e 's/./\L\0/g' | sort -u | sed -n '1~75p' > snowball_all/algorithms/czech/voc.txt
mkdir snowball_code/algorithms/czech
# Fetch the stemmer source straight into the compiler tree.
wget -O snowball_code/algorithms/czech/stem_Unicode.sbl http://snowball.tartarus.org/otherapps/oregan/czech-do.sbl
# The remaining lines form one logical command line (joined by "\"):
#   - VARS_8859_2 collects the value of every  hex '<value>'  occurrence in
#     the .sbl file (one per matching line).
#   - For each value the loop emits that byte with printf, converts it from
#     ISO-8859-2 to UNICODELITTLE with iconv, and reads the result back from
#     hexdump's output with leading zeros stripped and letters upper-cased.
#   - The old => new mapping is echoed, and the first matching  hex '<old>'
#     on each line of the file is rewritten in place to  hex '<new>'.
# NOTE(review): "\x" escapes in a printf format are not required by POSIX;
# confirm that the shell running this script supports them.
VARS_8859_2=`sed -n "s/^.*\s\+hex\s\+'\(\w\+\)'.*/\1/p" snowball_code/algorithms/czech/stem_Unicode.sbl`; \
for v in $VARS_8859_2; do \
vc=`printf "\x$v" | iconv -f ISO-8859-2 -t UNICODELITTLE | hexdump | sed -n 's/^.*\s\+0*\(\w\+\).*/\U\1/p'`; \
echo "'$v' => '$vc'"; sed -i "s/\s\+hex\s\+'$v'/ hex '$vc'/" snowball_code/algorithms/czech/stem_Unicode.sbl; done;
# irish: build voc.txt from the aspell "ga" dictionary
# (debian: sudo apt-get install aspell-ga) and download the stemmer.
# snowball_all has no irish directory yet, so create it first.
mkdir snowball_all/algorithms/irish
# Pipeline: dump the master word list, expand it, put one word per line,
# lower-case every character, sort and drop duplicates, then keep only
# lines 1, 6, 11, ... (every 5th line).
aspell -l ga dump master |
  aspell -l ga expand |
  tr ' ' '\n' |
  sed -e 's/./\L\0/g' |
  sort -u |
  sed -n '1~5p' > snowball_all/algorithms/irish/voc.txt
# Fetch the stemmer source straight into the compiler tree as
# stem_Unicode.sbl.
mkdir snowball_code/algorithms/irish
wget -O snowball_code/algorithms/irish/stem_Unicode.sbl \
  http://snowball.tartarus.org/otherapps/oregan/irish.sbl
# slovene - build voc.txt from the aspell "sl" dictionary (apt-get install
# aspell-sl), install the stemmer kept in this repository, and convert its
# ISO-8859-2 hex character codes to Unicode code points.
# Vocabulary pipeline: dump the master word list, expand it, one word per
# line, lower-case, sort + de-duplicate, keep every 15th line (1, 16, ...).
mkdir snowball_all/algorithms/slovene
aspell -l sl dump master | aspell -l sl expand | tr ' ' '\n' | sed -e 's/./\L\0/g' | sort -u | sed -n '1~15p' > snowball_all/algorithms/slovene/voc.txt
mkdir snowball_code/algorithms/slovene
# Unlike czech and irish, the stemmer source is not downloaded here: it is
# copied from the local albert_einstein/ directory.
cp albert_einstein/slovene_stemmer_from_discuss.sbl snowball_code/algorithms/slovene/stem_Unicode.sbl
# The remaining lines form one logical command line (joined by "\"):
#   - VARS_8859_2 collects the value of every  hex '<value>'  occurrence in
#     the .sbl file (one per matching line).
#   - For each value the loop emits that byte with printf, converts it from
#     ISO-8859-2 to UNICODELITTLE with iconv, and reads the result back from
#     hexdump's output with leading zeros stripped and letters upper-cased.
#   - The old => new mapping is echoed, and the first matching  hex '<old>'
#     on each line of the file is rewritten in place to  hex '<new>'.
# NOTE(review): "\x" escapes in a printf format are not required by POSIX;
# confirm that the shell running this script supports them.
VARS_8859_2=`sed -n "s/^.*\s\+hex\s\+'\(\w\+\)'.*/\1/p" snowball_code/algorithms/slovene/stem_Unicode.sbl`; \
for v in $VARS_8859_2; do \
vc=`printf "\x$v" | iconv -f ISO-8859-2 -t UNICODELITTLE | hexdump | sed -n 's/^.*\s\+0*\(\w\+\).*/\U\1/p'`; \
echo "'$v' => '$vc'"; sed -i "s/\s\+hex\s\+'$v'/ hex '$vc'/" snowball_code/algorithms/slovene/stem_Unicode.sbl; done;