-
Notifications
You must be signed in to change notification settings - Fork 53
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #727 from clarin-eric/devel
@TomazErjavec, merging to main
- Loading branch information
Showing
1,483 changed files
with
36,123 additions
and
18,247 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,7 +3,7 @@ cd ParlaMint | |
|
||
FAIL=0 | ||
|
||
DATADIR=Data | ||
DATADIR=Sample | ||
|
||
TESTDIR="SAMPLE/Parla-CLARIN" | ||
mkdir -p $TESTDIR | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,5 +4,3 @@ nohup.* | |
*.zip | ||
*.tar | ||
*.tgz | ||
validation* | ||
Data/TMP |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# ParlaMint Corpora Log files | ||
|
||
This directory contains the log files made by the release pipeline. | ||
For each corpus there are 3 files: | ||
* `ParlaMint-XX.log`: the complete log for ParlaMint-XX | ||
* `ParlaMint-XX.warn.log`: warnings only | ||
* `ParlaMint-XX.error.log`: errors only | ||
|
||
Each of these logs also exists in a variant with the `-en` suffix (e.g. `ParlaMint-XX-en.log`); these are the logs for the corpora | ||
that have been machine translated to English. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,55 +1,87 @@ | ||
## Transliteration tests | ||
test-translit4: | ||
$s tsv=0.tsv -xsl:bin/trans-tsv2tei.xsl Sources-TEI/ParlaMint-BG.TEI/ParlaMint-BG-listPerson.xml > ParlaMint-BG-listPerson.xml | ||
test-translit3: | ||
bin/trans-execute.pl Sources-TEI/ParlaMint-BG.TEI/ParlaMint-BG-listPerson.xml | ||
test-translit2: | ||
$s -xsl:bin/trans-tei2tsv.xsl Sources-TEI/ParlaMint-BG.TEI/ParlaMint-BG-listOrg.xml > ParlaMint-BG-listOrg.tsv | ||
test-translit1: | ||
$s -xsl:bin/trans-tei2tsv.xsl Sources-TEI/ParlaMint-BG.TEI/ParlaMint-BG-listPerson.xml > ParlaMint-BG-listPerson.tsv | ||
$s -xsl:bin/trans-tei2tsv.xsl Sources-TEI/ParlaMint-GR.TEI/ParlaMint-GR-listPerson.xml > ParlaMint-GR-listPerson.tsv | ||
$s -xsl:bin/trans-tei2tsv.xsl Sources-TEI/ParlaMint-UA.TEI/ParlaMint-UA-listPerson.xml > ParlaMint-UA-listPerson.tsv | ||
|
||
######## Merging taxonomies | ||
|
||
TAXONOMIES-TEI = subcorpus speaker_types parla.legislature | ||
TAXONOMIES-ANA = NER | ||
|
||
merge-taxos-nohup: | ||
nohup time make merge-taxos 2> Taxonomies/ParlaMint-taxonomy-merge.log > Logs/ParlaMint-taxonomy.log & | ||
|
||
merge-taxos: | ||
for TAXONOMY in ${TAXONOMIES-TEI}; do \ | ||
$s template=../Corpora/Taxonomies/ParlaMint-taxonomy-$${TAXONOMY}.template.xml \ | ||
-xsl:../Scripts/parlamint-merge-taxonomy.xsl Master/ParlaMint.xml \ | ||
> Taxonomies/ParlaMint-taxonomy-$${TAXONOMY}.xml; \ | ||
done; | ||
for TAXONOMY in ${TAXONOMIES-ANA}; do \ | ||
$s template=../Corpora/Taxonomies/ParlaMint-taxonomy-$${TAXONOMY}.ana.template.xml \ | ||
-xsl:../Scripts/parlamint-merge-taxonomy.xsl Master/ParlaMint.ana.xml \ | ||
> Taxonomies/ParlaMint-taxonomy-$${TAXONOMY}.ana.xml; \ | ||
done; | ||
${vta} Taxonomies/ParlaMint-taxonomy-*.xml | ||
|
||
############### Makefile for making a distributable version of the ParlaMint and ParlaMint-en corpora | ||
|
||
### VARIABLES | ||
|
||
# All ParlaMint II corpora | ||
# CORPORA = AT BA BE BG CZ DK EE ES ES-CT ES-GA ES-PV FI FR GB GR HR HU IS IT LT LV NL NO PL PT RS RO SE SI TR UA | ||
# Missing corpora: ES ES-PV FI LT RO | ||
# Missing corpora: FI LT RO ES-PV | ||
|
||
######## SUBMITTED CORPORA FOR V 3.0 | ||
# CORPORA = AT BA BE BG CZ DK EE ES-CT ES-GA FR GB GR HR HU IS IT LV NL NO PL PT RS SE SI TR UA | ||
######## SUBMITTED CORPORA FOR V 3.1 | ||
# CORPORA = AT BA BE BG CZ DK EE ES ES-CT ES-GA FR GB GR HR HU IS IT LV NL NO PL PT RS SE SI TR UA | ||
|
||
# Partial runs: | ||
CORPORA = AT | ||
|
||
######## MTed CORPORA FOR V 3.0 | ||
MT-CORPORA = AT-en BA-en BE-en BG-en CZ-en DK-en EE-en ES-CT-en ES-GA-en FR-en GR-en HR-en HU-en IS-en IT-en LV-en NL-en NO-en PL-en PT-en RS-en SE-en SI-en TR-en UA-en | ||
######## MTed CORPORA FOR V 3.1 | ||
MT-CORPORA = AT-en BA-en BE-en BG-en CZ-en DK-en EE-en ES-en ES-CT-en ES-GA-en FR-en GR-en HR-en HU-en IS-en IT-en LV-en NL-en NO-en PL-en PT-en RS-en SE-en SI-en TR-en UA-en | ||
|
||
# Used in test targets: | ||
CORPUS = UA | ||
|
||
#Where things are, as we use several branches: this one (most likely dev), and documentation | ||
PARLAMINT = /project/corpora/Parla/ParlaMint | ||
HERE = ${PARLAMINT}/ParlaMint-V3/Distro | ||
DOC = ${PARLAMINT}/ParlaMint-documentation | ||
SCH = ${DOC}/Schema | ||
PARLAMINT = /project/corpora/Parla/ParlaMint/ParlaMint | ||
SCH = ${PARLAMINT}/Schema | ||
HERE = ${PARLAMINT}/Corpora | ||
TEMP = ${HERE}/Temp | ||
|
||
#Where the submitted corpora are found (ParlaMint-XX.TEI/ and ParlaMint-XX.TEI.ana/) | ||
SOURCE = ${HERE}/Source | ||
SOURCE-MT = ${HERE}/Source-MT | ||
SOURCES = ${HERE}/Sources-TEI | ||
SOURCES-MT = ${HERE}/Sources-MT | ||
|
||
# Version number and PID of next(!) TEI and TEI.ana ParlaMint release | ||
VERSION = 3.1 | ||
HANDLE-TEI = http://hdl.handle.net/11356/1859 | ||
HANDLE-ANA = http://hdl.handle.net/11356/1860 | ||
|
||
# Version number and PID of future(!) MTed ParlaMint-en.ana release | ||
VERSION-MT = 3.0 | ||
HANDLE-MT = http://hdl.handle.net/11356/1810 | ||
# Version number and PID of next MTed ParlaMint-en.ana release | ||
VERSION-MT = 3.1 | ||
HANDLE-MT = http://hdl.handle.net/11356/1864 | ||
|
||
#Where the produced corpora are put for inspection | ||
WEB = [email protected]:/home/tomaz/www/tmp/ParlaMint/ | ||
|
||
###### Targets | ||
|
||
### Fixes for 3.0-en: | ||
### Fixes for 3.1-en: | ||
|
||
# Instead of TEI-derived CoNLL-U files we release original MTed CoNLL-U | ||
# because it contains word alignments | ||
# because they also contain word alignments | ||
# Script also adds -en suffix to filenames + readme. | ||
mt-cp-conllu: | ||
bin/cp-conllu.pl 'Source-MT/ParlaMint-*-en.conllu' 'Master' | ||
bin/cp-conllu.pl validate '${SOURCES-MT}/ParlaMint-*-en.conllu' 'Master' | ||
|
||
# Make txt and tsv files with tsvs | ||
mt-convert-txt: | ||
|
@@ -62,9 +94,9 @@ mt-convert-txt: | |
mrg-conll-nohup: | ||
nohup time make mrg-conll > Logs/ParlaMint_Merge_CoNLL-U.log & | ||
mrg-conll: | ||
bin/merge-conllu.pl Master/ParlaMint-BE.conllu Source-MT/ParlaMint-BE-en.conllu | ||
bin/merge-conllu.pl Master/ParlaMint-ES-CT.conllu Source-MT/ParlaMint-ES-CT-en.conllu | ||
bin/merge-conllu.pl Master/ParlaMint-UA.conllu Source-MT/ParlaMint-UA-en.conllu | ||
bin/merge-conllu.pl Master/ParlaMint-BE.conllu ${SOURCES-MT}/ParlaMint-BE-en.conllu | ||
bin/merge-conllu.pl Master/ParlaMint-ES-CT.conllu ${SOURCES-MT}/ParlaMint-ES-CT-en.conllu | ||
bin/merge-conllu.pl Master/ParlaMint-UA.conllu ${SOURCES-MT}/ParlaMint-UA-en.conllu | ||
|
||
# Fix a mistake with handle in corpora | ||
fix-handle: | ||
|
@@ -81,15 +113,15 @@ cp-readmes: | |
# Make samples only | ||
samples: | ||
for CORPUS in ${CORPORA}; do \ | ||
${FINALIZE} -sample -codes $${CORPUS} -in ${SOURCE} -out ${HERE}/Master 2> Logs/ParlaMint-$${CORPUS}.log; \ | ||
${FINALIZE} -sample -codes $${CORPUS} -in ${SOURCES} -out ${HERE}/Master 2> Logs/ParlaMint-$${CORPUS}.log; \ | ||
grep -a -i 'error' Logs/ParlaMint-$${CORPUS}.log > Logs/ParlaMint-$${CORPUS}.error.log; \ | ||
grep -a -i 'warn' Logs/ParlaMint-$${CORPUS}.log > Logs/ParlaMint-$${CORPUS}.warn.log; \ | ||
done; | ||
|
||
# Make vertical files only | ||
make-verts: | ||
for CORPUS in ${CORPORA}; do \ | ||
${FINALIZE} -vert -codes $${CORPUS} -in ${SOURCE} -out ${HERE}/Master \ | ||
${FINALIZE} -vert -codes $${CORPUS} -in ${SOURCES} -out ${HERE}/Master; \ | ||
done; | ||
make verts | ||
|
||
|
@@ -106,7 +138,7 @@ mt-make-root: | |
|
||
cp-samples: | ||
# bin/cp-samples.pl 'Master/Sample-ParlaMint-*-en' Test | ||
bin/cp-samples.pl 'Master/Sample-ParlaMint-*-en' ../Data | ||
bin/cp-samples.pl 'Master/Sample-ParlaMint-*-en' ../Samples | ||
|
||
mt-logs: | ||
for CORPUS in ${CORPORA}; do \ | ||
|
@@ -120,10 +152,6 @@ web: | |
rsync -av Logs/*.log ${WEB}/Logs | ||
rsync -av Packed/*.tgz ${WEB}/Repo | ||
|
||
###### Factorisation of source corpora; needs to be run only once | ||
factor-all: | ||
../Scripts/parlamint-factorize-corpora.pl ${SOURCE} | ||
|
||
###### Targets for producing releasable version of ParlaMint corpora | ||
FINALIZE = perl ../Scripts/parlamint2distro.pl -version ${VERSION} -teihandle ${HANDLE-TEI} -anahandle ${HANDLE-ANA} -schema ../Schema -docs Docs | ||
|
||
|
@@ -136,16 +164,16 @@ nohup: | |
nohup2: | ||
nice nohup time make all > Logs/ParlaMint.2.log & | ||
|
||
all: final | ||
all: final verts | ||
xall: final verts pack | ||
|
||
pack: | ||
perl ../Scripts/pack-parlamint.pl -codes '${CORPORA}' -in Master -out Packed | ||
verts: | ||
perl ../Scripts/join-verts.pl -codes '${CORPORA}' -in Master -out Verts | ||
perl ../Scripts/join-verts.pl -version ${VERSION} -codes '${CORPORA}' -in Master -out Verts | ||
final: | ||
for CORPUS in ${CORPORA}; do \ | ||
${FINALIZE} -all -codes $${CORPUS} -in ${SOURCE} -out ${HERE}/Master 2> Logs/ParlaMint-$${CORPUS}.log; \ | ||
${FINALIZE} -all -codes $${CORPUS} -in ${SOURCES} -out ${HERE}/Master 2> Logs/ParlaMint-$${CORPUS}.log; \ | ||
grep -a -i 'error' Logs/ParlaMint-$${CORPUS}.log > Logs/ParlaMint-$${CORPUS}.error.log; \ | ||
grep -a -i 'warn' Logs/ParlaMint-$${CORPUS}.log > Logs/ParlaMint-$${CORPUS}.warn.log; \ | ||
echo "$${CORPUS}.warn"; \ | ||
|
@@ -162,13 +190,13 @@ final: | |
### Make MTed corpora | ||
|
||
# Make distribution with: | ||
FINALIZE-MT = perl ../Scripts/parlamint2distro.pl -version ${VERSION-MT} -anahandle ${HANDLE-MT} -schema ${DOC}/Schema -docs ${HERE}/Docs | ||
FINALIZE-MT = perl ../Scripts/parlamint2distro.pl -version ${VERSION-MT} -anahandle ${HANDLE-MT} -schema ${PARLAMINT}/Schema -docs ${HERE}/Docs | ||
|
||
# Targets | ||
mt-nohup: | ||
nice nohup time make mt-all-final > Logs/ParlaMint-en.3.log & | ||
mt-all-final: mt-pack mt-web | ||
mt-xall-final: mt-convert mt-verts mt-pack mt-web | ||
nice nohup time make mt-all-final > Logs/ParlaMint-en.log & | ||
mt-all-final: mt-convert | ||
mt-xall-final: mt-init mt-convert mt-verts mt-pack mt-web | ||
|
||
mt-web: | ||
rsync -av Logs/*-en*.log ${WEB}/Logs | ||
|
@@ -198,8 +226,8 @@ mt-convert: | |
for CORPUS in ${CORPORA}; do \ | ||
perl ../Scripts/mt-conllu2tei.pl \ | ||
${HERE}/Master/ParlaMint-$${CORPUS}.TEI.ana/ParlaMint-$${CORPUS}.ana.xml \ | ||
${SOURCE-MT}/ParlaMint-$${CORPUS}-en-notes.tsv \ | ||
${SOURCE-MT}/ParlaMint-$${CORPUS}-en.conllu \ | ||
${SOURCES-MT}/ParlaMint-$${CORPUS}-en-notes.tsv \ | ||
${SOURCES-MT}/ParlaMint-$${CORPUS}-en.conllu \ | ||
${TEMP}/ParlaMint-$${CORPUS}-en.TEI.ana 2> Logs/ParlaMint-$${CORPUS}-en.log; \ | ||
${FINALIZE-MT} -all -notei -noconll -codes $${CORPUS}-en -in ${TEMP} -out ${HERE}/Master \ | ||
2>> Logs/ParlaMint-$${CORPUS}-en.log; \ | ||
|
@@ -208,6 +236,15 @@ mt-convert: | |
done; | ||
|
||
### Tests for debugging MT processing | ||
|
||
mt-test9: | ||
perl ../Scripts/mt-conllu2tei.pl \ | ||
${HERE}/Master/ParlaMint-LV.TEI.ana/ParlaMint-LV.ana.xml \ | ||
${SOURCES-MT}/ParlaMint-LV-en-notes.tsv \ | ||
Sources-Sem/Test/ParlaMint-LV-en.conllu \ | ||
${TEMP}/ParlaMint-LV-en.TEI.ana | ||
${FINALIZE-MT} -all -notei -noconll -codes LV-en -in ${TEMP} -out ${HERE}/Test | ||
|
||
mt-test8: | ||
$s -xsl:../Scripts/validate-parlamint.xsl \ | ||
${HERE}/Master/ParlaMint-AT-en.TEI.ana/ParlaMint-AT-en.ana.xml | ||
|
@@ -237,46 +274,13 @@ mt-test2a: | |
perl ../Scripts/conllu2tei.pl < Test/0.conllu > Test/0.body.xml | ||
mt-test2: | ||
perl ../Scripts/conllu2tei.pl \ | ||
< Source-MT/ParlaMint-LV-en.conllu/2015/ParlaMint-LV_2015-11-12-PT12-329.conllu \ | ||
< ${SOURCES-MT}/ParlaMint-LV-en.conllu/2015/ParlaMint-LV_2015-11-12-PT12-329.conllu \ | ||
> Test/ParlaMint-LV_2015-11-12-PT12-329.body.xml | ||
xmllint --noout Test/ParlaMint-LV_2015-11-12-PT12-329.body.xml | ||
mt-test1: | ||
rm -fr Test/ParlaMint-CZ.tmp/* | ||
$s outDir=Test/ParlaMint-CZ.tmp -xsl:../Scripts/mt-prepare4mt.xsl ${HERE}/Master/ParlaMint-CZ.TEI.ana/ParlaMint-CZ.ana.xml | ||
|
||
# Producing almost XX-en, uses CORPUS variable | ||
mt-prep-nohup: | ||
nohup time make mt-prep-cnv > Logs/ParlaMint-${CORPUS}-mt2tei.log & | ||
mt-prep-cnv: | ||
perl ../Scripts/mt-conllu2tei.pl \ | ||
${HERE}/Master/ParlaMint-${CORPUS}.TEI.ana/ParlaMint-${CORPUS}.ana.xml \ | ||
${SOURCE-MT}/ParlaMint-${CORPUS}-en-notes.tsv \ | ||
${SOURCE-MT}/ParlaMint-${CORPUS}-en.conllu \ | ||
Test/ParlaMint-${CORPUS}-en.TEI.ana | ||
|
||
#Tests for original corpora | ||
test1: | ||
rm -fr Test/Out/ParlaMint-${CORPUS}* | ||
perl ../Scripts/parlamint2distro.pl -all -codes ${CORPUS} -in Test/In -out Test/Out \ | ||
-version 3.0 -teihandle ${HANDLE-TEI} -anahandle ${HANDLE-ANA} -schema ../Schema -docs Docs \ | ||
|
||
test-factorize: | ||
rm -fr Test/Factorized/ParlaMint-${CORPUS}* | ||
mkdir Test/Factorized || : | ||
$s outDir=Test/Factorized/ParlaMint-${CORPUS}.TEI \ | ||
prefix="ParlaMint-${CORPUS}-" \ | ||
-xsl:../Scripts/parlamint-factorize-teiHeader.xsl \ | ||
${SOURCE}/ParlaMint-${CORPUS}.TEI/ParlaMint-${CORPUS}.xml || : | ||
$s outDir=Test/Factorized/ParlaMint-${CORPUS}.TEI.ana \ | ||
prefix="ParlaMint-${CORPUS}-" \ | ||
teiRoot=`pwd`"/Test/Factorized/ParlaMint-${CORPUS}.TEI/ParlaMint-${CORPUS}.xml" \ | ||
-xsl:../Scripts/parlamint-factorize-teiHeader.xsl \ | ||
${SOURCE}/ParlaMint-${CORPUS}.TEI.ana/ParlaMint-${CORPUS}.ana.xml || : | ||
|
||
errs1: | ||
grep -i error Logs/*.log | \ | ||
grep -v '...suppressing' | grep -v 'Format errors' | grep -v 'Syntax errors' | grep -v 'FAILED' | ||
|
||
### Some idea, need to think about it! | ||
#REGIS=at ba be bg cz dk es_ct fr gb gr hr hu is it lv nl no pl pt rs se si tr ua | ||
REGIS=ua | ||
|
Oops, something went wrong.