Skip to content

Commit

Permalink
Add potential new corpora (DE, ES-AN, SK).
Browse files Browse the repository at this point in the history
  • Loading branch information
TomazErjavec committed Oct 16, 2024
1 parent 2bc9d17 commit f9a0b6a
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 13 deletions.
3 changes: 3 additions & 0 deletions Build/Scripts/mt-prepare4mt.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,9 @@
<xsl:when test="$country-code = 'BA'">zls</xsl:when>
<xsl:when test="$country-code = 'BG'">bg</xsl:when>
<xsl:when test="$country-code = 'CZ'">cs</xsl:when>
<xsl:when test="$country-code = 'DE'">gmw</xsl:when>
<xsl:when test="$country-code = 'DK'">da</xsl:when>
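<!-- TODO: there is no mapping for plain 'ES' here, although ES-CT and ES-GA are covered; confirm whether this is intentional. -->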
<xsl:when test="$country-code = 'ES-CT'">roa</xsl:when>
<xsl:when test="$country-code = 'ES-GA'">itc</xsl:when>
<xsl:when test="$country-code = 'GR'">grk</xsl:when>
Expand All @@ -98,6 +100,7 @@
<xsl:when test="$country-code = 'RS'">zls</xsl:when>
<xsl:when test="$country-code = 'SE'">sv</xsl:when>
<xsl:when test="$country-code = 'SI'">sla</xsl:when>
<xsl:when test="$country-code = 'SK'">sla</xsl:when>
<xsl:when test="$country-code = 'TR'">tr</xsl:when>
<xsl:when test="$country-code = 'UA'">sla</xsl:when>
</xsl:choose>
Expand Down
13 changes: 13 additions & 0 deletions Scripts/parlamint-add-common-content.xsl
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,11 @@
<term>Bicameralism</term>
<term>Lower house</term>
</xsl:when>
<xsl:when test="$country-code = 'DE'">
<term>Legislature</term>
<term>Bicameralism</term>
<term>Lower house</term>
</xsl:when>
<xsl:when test="$country-code = 'DK'">
<term>Legislature</term>
<term>Unicameralism</term>
Expand All @@ -123,6 +128,10 @@
<term>Bicameralism</term>
<term>Lower house</term>
</xsl:when>
<xsl:when test="$country-code = 'ES-AN'">
<term>Legislature</term>
<term>Unicameralism</term>
</xsl:when>
<xsl:when test="$country-code = 'ES-CT'">
<term>Legislature</term>
<term>Unicameralism</term>
Expand Down Expand Up @@ -229,6 +238,10 @@
<term>Bicameralism</term>
<term>Lower house</term>
</xsl:when>
<xsl:when test="$country-code = 'SK'">
<term>Legislature</term>
<term>Unicameralism</term>
</xsl:when>
<xsl:when test="$country-code = 'TR'">
<term>Legislature</term>
<term>Unicameralism</term>
Expand Down
15 changes: 10 additions & 5 deletions Scripts/parlamint-factorize-corpora.pl
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,17 @@
# Mapping of countries to languages; we need it for mapping of common taxonomies.
# For multilingual corpora the languages are comma-separated, with the main language first.
$country2lang{'AT'} = 'de';
$country2lang{'BA'} = 'bs';
$country2lang{'BE'} = 'nl';
$country2lang{'BE'} = 'nl, fr';
$country2lang{'BG'} = 'bg';
$country2lang{'CZ'} = 'cs';
$country2lang{'DE'} = 'de';
$country2lang{'DK'} = 'da';
$country2lang{'EE'} = 'et';
$country2lang{'ES'} = 'es';
$country2lang{'ES-CT'} = 'ca';
$country2lang{'ES-AN'} = 'es';
$country2lang{'ES-CT'} = 'ca, es';
$country2lang{'ES-GA'} = 'gl';
$country2lang{'ES-PV'} = 'eu';
$country2lang{'ES-PV'} = 'eu, es';
$country2lang{'FI'} = 'fi';
$country2lang{'FR'} = 'fr';
$country2lang{'GB'} = 'en';
Expand All @@ -49,8 +51,9 @@
$country2lang{'RS'} = 'sr';
$country2lang{'SE'} = 'sv';
$country2lang{'SI'} = 'sl';
$country2lang{'SK'} = 'sk';
$country2lang{'TR'} = 'tr';
$country2lang{'UA'} = 'uk';
$country2lang{'UA'} = 'uk, ru';

$bkpName = "BKP";
$Saxon = "java -jar $Bin/bin/saxon.jar";
Expand Down Expand Up @@ -106,7 +109,9 @@
push(@missing_taxonomies, $taxonomyFName)
}
else {print STDERR "WARN: Inserting forced taxonomy file $taxonomyFName\n"}
my $command = "$Saxon if-lang-missing=skip langs='$country2lang{$country}' -xsl:$scriptTaxonomy";
# NOTE(review): the previous form of this command indexed %country2lang with $country
# (lower case), whereas the lookup below uses $Country; confirm that $Country is
# actually set at this point in this script, otherwise $Language ends up empty.
my $Language = $country2lang{$Country};
$Language =~ s/, .+//; # For multilingual corpora, take the first language as the main language
my $command = "$Saxon if-lang-missing=skip langs='$Language' -xsl:$scriptTaxonomy";
`$command $CommonTaxonomyFile > $taxonomyFile`;
}
}
Expand Down
5 changes: 4 additions & 1 deletion Scripts/parlamint2conllu.pl
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,14 @@ sub usage

$country2lang{'AT'} = 'de';
$country2lang{'BA'} = 'bs';
$country2lang{'BE'} = 'fr, nl';
$country2lang{'BE'} = 'nl, fr';
$country2lang{'BG'} = 'bg';
$country2lang{'CZ'} = 'cs';
$country2lang{'DE'} = 'de';
$country2lang{'DK'} = 'da';
$country2lang{'EE'} = 'et';
$country2lang{'ES'} = 'es';
$country2lang{'ES-AN'} = 'es';
$country2lang{'ES-CT'} = 'ca, es';
$country2lang{'ES-GA'} = 'gl';
$country2lang{'ES-PV'} = 'eu, es';
Expand All @@ -58,6 +60,7 @@ sub usage
$country2lang{'RS'} = 'sr';
$country2lang{'SE'} = 'sv';
$country2lang{'SI'} = 'sl';
$country2lang{'SK'} = 'sk';
$country2lang{'TR'} = 'tr';
$country2lang{'UA'} = 'uk, ru';

Expand Down
17 changes: 11 additions & 6 deletions Scripts/parlamint2distro.pl
Original file line number Diff line number Diff line change
Expand Up @@ -124,15 +124,17 @@ sub usage {
# Mapping of countries to languages; we need it for mapping of common taxonomies.
# For multilingual corpora the languages are comma-separated, with the main language first.
$country2lang{'AT'} = 'de';
$country2lang{'BA'} = 'bs';
$country2lang{'BE'} = 'nl';
$country2lang{'BE'} = 'nl, fr';
$country2lang{'BG'} = 'bg';
$country2lang{'CZ'} = 'cs';
$country2lang{'DE'} = 'de';
$country2lang{'DK'} = 'da';
$country2lang{'EE'} = 'et';
$country2lang{'ES'} = 'es';
$country2lang{'ES-CT'} = 'ca';
$country2lang{'ES-AN'} = 'es';
$country2lang{'ES-CT'} = 'ca, es';
$country2lang{'ES-GA'} = 'gl';
$country2lang{'ES-PV'} = 'eu';
$country2lang{'ES-PV'} = 'eu, es';
$country2lang{'FI'} = 'fi';
$country2lang{'FR'} = 'fr';
$country2lang{'GB'} = 'en';
Expand All @@ -151,8 +153,9 @@ sub usage {
$country2lang{'RS'} = 'sr';
$country2lang{'SE'} = 'sv';
$country2lang{'SI'} = 'sl';
$country2lang{'SK'} = 'sk';
$country2lang{'TR'} = 'tr';
$country2lang{'UA'} = 'uk';
$country2lang{'UA'} = 'uk, ru';
# Fake country for testing:
$country2lang{'XX'} = 'hr';

Expand Down Expand Up @@ -403,8 +406,10 @@ sub commonTaxonomies {
if ($taxonomy !~ /\.ana/ or
($taxonomy =~ /\.ana/ and ($outDir =~ /\.ana/ or $outDir !~ /\.TEI/))) {
if (-e $taxonomy{$taxonomy}) {
if (exists($country2lang{$Country})) {
my $command = "$Saxon if-lang-missing=skip langs='$country2lang{$Country}' -xsl:$scriptTaxonomy";
if (exists($country2lang{$Country})) {
my $Language = $country2lang{$Country};
$Language =~ s/, .+//; #For multilingual corpora take the first language as main language
my $command = "$Saxon if-lang-missing=skip langs='$Language' -xsl:$scriptTaxonomy";
`$command $taxonomy{$taxonomy} > $outDir/$taxonomy.xml`;
}
else {
Expand Down
5 changes: 4 additions & 1 deletion Scripts/parlamintp2conllu.pl
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,14 @@ sub usage
# TODO: this country-to-language mapping is duplicated in several scripts and should be factored out into one shared place.
$country2lang{'AT'} = 'de';
$country2lang{'BA'} = 'sr'; # Should be 'bs', but UD does not support it!
$country2lang{'BE'} = 'fr, nl';
$country2lang{'BE'} = 'nl, fr';
$country2lang{'BG'} = 'bg';
$country2lang{'CZ'} = 'cs';
$country2lang{'DE'} = 'de';
$country2lang{'DK'} = 'da';
$country2lang{'EE'} = 'et';
$country2lang{'ES'} = 'es';
$country2lang{'ES-AN'} = 'es';
$country2lang{'ES-CT'} = 'ca, es';
$country2lang{'ES-GA'} = 'gl';
$country2lang{'ES-PV'} = 'eu, es';
Expand All @@ -65,6 +67,7 @@ sub usage
$country2lang{'RS'} = 'sr';
$country2lang{'SE'} = 'sv';
$country2lang{'SI'} = 'sl';
$country2lang{'SK'} = 'sk';
$country2lang{'TR'} = 'tr';
$country2lang{'UA'} = 'uk, ru';
# Fake country for testing:
Expand Down

0 comments on commit f9a0b6a

Please sign in to comment.