Skip to content

Commit

Permalink
Enable correct parsing of tokens like 2-й (see issue #670)
Browse files: browse the repository at this point in the history
  • Loading branch information
grandsbor committed Feb 8, 2016
1 parent bccb3c1 commit cdacce2
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions lib/lib_annot.php
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ class MorphParseSet {
public $token_text;
public $parses;
private static $gram_descr = array();
private static $RE_CYR_TOKEN = '/^[А-Яа-яЁё][А-Яа-яЁё\-\']*$/u';
private static $RE_MIXED_TOKEN = '/^[0-9]+\-[А-Яа-яЁё]+$/u'; // like '2-ого'

public function __construct($xml="", $token_text="", $force_unknown=false, $force_include_init=false) {
if ($xml)
Expand Down Expand Up @@ -174,7 +176,7 @@ private function _from_token($token, $force_unknown, $force_include_init) {
$cyrillic = false;
if ($force_unknown) {
$this->parses[] = new MorphParse($token, array(array('inner' => 'UNKN')));
} elseif (preg_match('/^[А-Яа-яЁё][А-Яа-яЁё\-\']*$/u', $token)) {
} elseif (preg_match(self::$RE_CYR_TOKEN, $token) || preg_match(self::$RE_MIXED_TOKEN, $token)) {
$cyrillic = true;
$res = sql_pe("
SELECT lemma_id, lemma_text, grammems
Expand All @@ -196,8 +198,10 @@ private function _from_token($token, $force_unknown, $force_include_init) {
$require_uc = false;
foreach ($matches[1] as $gr) {
$gramlist[] = array('inner' => $gr);
if ($gr == 'Init')
if ($gr == 'Init') {
$require_uc = true;
break;
}
}
}
if (!$require_uc || $force_include_init || preg_match('/^[А-ЯЁ]+$/u', $token))
Expand Down

0 comments on commit cdacce2

Please sign in to comment.