Skip to content

Commit

Permalink
Rewrite 7.sh dgsh script
Browse files Browse the repository at this point in the history
  • Loading branch information
gliargovas authored Sep 7, 2023
1 parent bc71b6c commit 4b3d3f1
Showing 1 changed file with 62 additions and 0 deletions.
62 changes: 62 additions & 0 deletions evaluation/benchmarks/dgsh/sequential/7.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/bin/bash

# Consistent sorting across machines
export LC_ALL=C

# Convert input into a ranked frequency list
ranked_frequency()
{
awk '{count[$1]++} END {for (i in count) print count[i], i}' |
# We want the standard sort here
sort -rn
}

# Convert standard input to a ranked frequency list of specified n-grams
ngram()
{
local N=$1

perl -ne 'for ($i = 0; $i < length($_) - '$N'; $i++) {
print substr($_, $i, '$N'), "\n";
}' |
ranked_frequency
}

# Temporary files
file1=$(mktemp)
file2=$(mktemp)
file3=$(mktemp)

cat > "$file1"

# Split input one word per line
tr -cs a-zA-Z '\n' < "$file1" > "$file2"

# Digram frequency
echo "Digram frequency"
ngram 2 < "$file2"

# Trigram frequency
echo "Trigram frequency"
ngram 3 < "$file2"

# Word frequency
echo "Word frequency"
ranked_frequency < "$file2"

# Store number of characters to use in awk below
nchars=$(wc -c < "$file1")

# Character frequency
echo "Character frequency"
sed 's/./&\
/g' < "$file1" |
# Print absolute
ranked_frequency | tee "$file3"

# Print relative
echo "Relative character frequency"
awk -v NCHARS=$nchars 'BEGIN {
OFMT = "%.2g%%"}
{print $1, $2, $1 / NCHARS * 100}' "$file3"

0 comments on commit 4b3d3f1

Please sign in to comment.