Skip to content

Commit

Permalink
implement transliteration word forms in search #33
Browse files Browse the repository at this point in the history
  • Loading branch information
ghost committed Oct 27, 2023
1 parent c7c5d73 commit 997666a
Show file tree
Hide file tree
Showing 8 changed files with 144 additions and 58 deletions.
17 changes: 7 additions & 10 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -92,28 +92,25 @@ APP_TORRENT_WANTED_FTP_FOLDER=/yggtracker
APP_TORRENT_WANTED_FTP_APPROVED_ONLY=1

# Enable search index for torrent name
APP_INDEX_TORRENT_NAME=1
APP_INDEX_TORRENT_NAME_ENABLED=1

# Enable search index for torrent info hash v1
APP_INDEX_TORRENT_HASH_V1=1
APP_INDEX_TORRENT_HASH_V1_ENABLED=1

# Enable search index for torrent info hash v2
APP_INDEX_TORRENT_HASH_V2=1
APP_INDEX_TORRENT_HASH_V2_ENABLED=1

# Enable search index for torrent filenames
APP_INDEX_TORRENT_FILENAMES=1
APP_INDEX_TORRENT_FILENAMES_ENABLED=1

# Enable search index for torrent source
APP_INDEX_TORRENT_SOURCE=1
APP_INDEX_TORRENT_SOURCE_ENABLED=1

# Enable search index for torrent comment
APP_INDEX_TORRENT_COMMENT=1
APP_INDEX_TORRENT_COMMENT_ENABLED=1

# Enable search index for words with a length greater than N chars
APP_INDEX_WORD_LENGTH_MIN=3

# Enable search index for words with a length not greater than N chars
APP_INDEX_WORD_LENGTH_MAX=255

# Enable search index transliteration @TODO
APP_INDEX_TRANSLITERATION=1
APP_INDEX_WORD_LENGTH_MAX=255
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ git checkout -b my-pr-branch-name
* [SVG icons](https://icons.getbootstrap.com)
* [Scraper](https://github.com/medariox/scrapeer) / [Composer Edition](https://github.com/YGGverse/scrapeer)
* [Bencode Library](https://github.com/Rhilip/Bencode)
* [Transliteration Library](https://github.com/ashtokalo/php-translit)
* [Identicons](https://github.com/dmester/jdenticon-php)

#### Support
Expand Down
1 change: 1 addition & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"php": ">=8.1",
"ext-ctype": "*",
"ext-iconv": "*",
"ashtokalo/php-translit": "^0.2.0",
"doctrine/annotations": "^2.0",
"doctrine/doctrine-bundle": "^2.10",
"doctrine/doctrine-migrations-bundle": "^3.2",
Expand Down
45 changes: 44 additions & 1 deletion composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 6 additions & 7 deletions config/services.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,14 @@ parameters:
app.torrent.wanted.ftp.enabled: '%env(APP_TORRENT_WANTED_FTP_ENABLED)%'
app.torrent.wanted.ftp.folder: '%env(APP_TORRENT_WANTED_FTP_FOLDER)%'
app.torrent.wanted.ftp.approved: '%env(APP_TORRENT_WANTED_FTP_APPROVED_ONLY)%'
app.index.torrent.name: '%env(APP_INDEX_TORRENT_NAME)%'
app.index.torrent.filenames: '%env(APP_INDEX_TORRENT_FILENAMES)%'
app.index.torrent.hash.v1: '%env(APP_INDEX_TORRENT_HASH_V1)%'
app.index.torrent.hash.v2: '%env(APP_INDEX_TORRENT_HASH_V2)%'
app.index.torrent.source: '%env(APP_INDEX_TORRENT_SOURCE)%'
app.index.torrent.comment: '%env(APP_INDEX_TORRENT_COMMENT)%'
app.index.torrent.name.enabled: '%env(APP_INDEX_TORRENT_NAME_ENABLED)%'
app.index.torrent.filenames.enabled: '%env(APP_INDEX_TORRENT_FILENAMES_ENABLED)%'
app.index.torrent.hash.v1.enabled: '%env(APP_INDEX_TORRENT_HASH_V1_ENABLED)%'
app.index.torrent.hash.v2.enabled: '%env(APP_INDEX_TORRENT_HASH_V2_ENABLED)%'
app.index.torrent.source.enabled: '%env(APP_INDEX_TORRENT_SOURCE_ENABLED)%'
app.index.torrent.comment.enabled: '%env(APP_INDEX_TORRENT_COMMENT_ENABLED)%'
app.index.word.length.min: '%env(APP_INDEX_WORD_LENGTH_MIN)%'
app.index.word.length.max: '%env(APP_INDEX_WORD_LENGTH_MAX)%'
app.index.transliteration: '%env(APP_INDEX_TRANSLITERATION)%'

services:
# default configuration for services in *this* file
Expand Down
28 changes: 12 additions & 16 deletions src/Controller/TorrentController.php
Original file line number Diff line number Diff line change
Expand Up @@ -229,8 +229,6 @@ public function search(
$activityService
);

//

// Init request
$query = $request->get('query') ? explode(' ', urldecode($request->get('query'))) : [];
$page = $request->get('page') ? (int) $request->get('page') : 1;
Expand Down Expand Up @@ -883,13 +881,12 @@ public function submit(

$file->getPathName(),

(bool) $this->getParameter('app.index.torrent.name'),
(bool) $this->getParameter('app.index.torrent.filenames'),
(bool) $this->getParameter('app.index.torrent.hash.v1'),
(bool) $this->getParameter('app.index.torrent.hash.v2'),
(bool) $this->getParameter('app.index.torrent.source'),
(bool) $this->getParameter('app.index.torrent.comment'),
(bool) $this->getParameter('app.index.transliteration'),
(bool) $this->getParameter('app.index.torrent.name.enabled'),
(bool) $this->getParameter('app.index.torrent.filenames.enabled'),
(bool) $this->getParameter('app.index.torrent.hash.v1.enabled'),
(bool) $this->getParameter('app.index.torrent.hash.v2.enabled'),
(bool) $this->getParameter('app.index.torrent.source.enabled'),
(bool) $this->getParameter('app.index.torrent.comment.enabled'),
(int) $this->getParameter('app.index.word.length.min'),
(int) $this->getParameter('app.index.word.length.max'),

Expand Down Expand Up @@ -2453,13 +2450,12 @@ public function reindex(
{
// Reindex keywords
$torrentService->reindexTorrentKeywordsAll(
(bool) $this->getParameter('app.index.torrent.name'),
(bool) $this->getParameter('app.index.torrent.filenames'),
(bool) $this->getParameter('app.index.torrent.hash.v1'),
(bool) $this->getParameter('app.index.torrent.hash.v2'),
(bool) $this->getParameter('app.index.torrent.source'),
(bool) $this->getParameter('app.index.torrent.comment'),
(bool) $this->getParameter('app.index.transliteration'),
(bool) $this->getParameter('app.index.torrent.name.enabled'),
(bool) $this->getParameter('app.index.torrent.filenames.enabled'),
(bool) $this->getParameter('app.index.torrent.hash.v1.enabled'),
(bool) $this->getParameter('app.index.torrent.hash.v2.enabled'),
(bool) $this->getParameter('app.index.torrent.source.enabled'),
(bool) $this->getParameter('app.index.torrent.comment.enabled'),
(int) $this->getParameter('app.index.word.length.min'),
(int) $this->getParameter('app.index.word.length.max')
);
Expand Down
82 changes: 73 additions & 9 deletions src/Repository/TorrentRepository.php
Original file line number Diff line number Diff line change
Expand Up @@ -73,27 +73,36 @@ private function getTorrentsQueryByFilter(
int $userId,
array $keywords,
array $locales,
?bool $sensitive = null,
?bool $approved = null,
?bool $status = null,
?bool $sensitive = null,
?bool $approved = null,
?bool $status = null
): \Doctrine\ORM\QueryBuilder
{
$query = $this->createQueryBuilder('t');

if ($keywords)
{
$andKeywords = $query->expr()->andX();

foreach ($keywords as $i => $keyword)
{
$keyword = mb_strtolower($keyword); // all keywords stored in lowercase
// Make query to the index case insensitive
$keyword = mb_strtolower($keyword);

$andKeywords->add("t.keywords LIKE :keyword{$i}");
// Init OR condition for each word form
$orKeywords = $query->expr()->orX();

$orKeywords->add("t.keywords LIKE :keyword{$i}");
$query->setParameter(":keyword{$i}", "%{$keyword}%");
}

$query->andWhere($andKeywords);
// Generate word forms for each transliteration locale #33
foreach ($this->generateWordForms($keyword) as $j => $wordForm)
{
$orKeywords->add("t.keywords LIKE :keyword{$i}{$j}");
$query->setParameter(":keyword{$i}{$j}", "%{$wordForm}%");
}

// Append AND condition
$query->andWhere($orKeywords);
}
}

if ($locales)
Expand Down Expand Up @@ -153,4 +162,59 @@ private function getTorrentsQueryByFilter(

return $query;
}

/**
 * Word forms generator to improve search results (#33),
 * e.g. transliteration rules for latin filenames.
 *
 * Every locale in $transliteration yields one form of $keyword via the
 * php-translit library; every pair in $charForms is then applied (one pair
 * at a time) to each of those transliterated forms and appended as an
 * extra form. Char forms are NOT applied to the forms they produce, and
 * are not applied to $keyword itself unless a transliteration returned it.
 *
 * @param string                $keyword         Search keyword (the visible caller lowercases it first)
 * @param list<string>          $transliteration Locale codes passed to Translit::convert()
 * @param array<string, string> $charForms       Substring replacements, search => replacement
 *
 * @return list<string> Unique, non-empty word forms, sequentially indexed from 0
 */
private function generateWordForms(
    string $keyword,
    // #33 supported locales:
    // https://github.com/ashtokalo/php-translit
    array $transliteration = [
        'be',
        'bg',
        'el',
        'hy',
        'kk',
        'mk',
        'ru',
        'ka',
        'uk'
    ],
    // Additional char forms
    array $charForms =
    [
        'c' => 'k',
        'k' => 'c',
    ]
): array
{
    $wordForms = [];

    // Apply transliteration
    if ($transliteration)
    {
        // Resolve the converter once instead of once per locale
        $translit = \ashtokalo\translit\Translit::object();

        foreach ($transliteration as $locale)
        {
            $wordForms[] = $translit->convert(
                $keyword,
                $locale
            );
        }
    }

    // Apply char forms to the transliterated forms only: iterate over an
    // explicit snapshot so the forms appended below are never re-processed
    $transliterated = $wordForms;

    foreach ($transliterated as $wordForm)
    {
        foreach ($charForms as $from => $to)
        {
            $wordForms[] = str_replace(
                $from,
                $to,
                $wordForm
            );
        }
    }

    // Drop empty forms: the caller wraps each form as "%{form}%", so an
    // empty string would become "%%" and match every row
    $wordForms = array_filter(
        $wordForms,
        static fn ($wordForm): bool => is_string($wordForm) && $wordForm !== ''
    );

    // Remove duplicates; array_unique() preserves keys, so reindex to
    // return a proper list
    return array_values(
        array_unique(
            $wordForms
        )
    );
}
}
15 changes: 0 additions & 15 deletions src/Service/TorrentService.php
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ public function readTorrentFileByTorrentId(

public function generateTorrentKeywordsByString(
string $string,
bool $transliteration,
int $wordLengthMin,
int $wordLengthMax,
): array
Expand Down Expand Up @@ -97,11 +96,6 @@ public function generateTorrentKeywordsByString(
{
// Apply case insensitive search conversion
$words[$key] = mb_strtolower($value);

if ($transliteration)
{
// @TODO
}
}
}

Expand Down Expand Up @@ -129,7 +123,6 @@ public function generateTorrentKeywordsByTorrentFilepath(
bool $extractSource,
bool $extractComment,

bool $wordTransliteration,
int $wordLengthMin,
int $wordLengthMax

Expand All @@ -147,7 +140,6 @@ public function generateTorrentKeywordsByTorrentFilepath(
$keywords,
$this->generateTorrentKeywordsByString(
$name,
$wordTransliteration,
$wordLengthMin,
$wordLengthMax
)
Expand All @@ -163,7 +155,6 @@ public function generateTorrentKeywordsByTorrentFilepath(
$keywords,
$this->generateTorrentKeywordsByString(
$list['path'],
$wordTransliteration,
$wordLengthMin,
$wordLengthMax
)
Expand All @@ -179,7 +170,6 @@ public function generateTorrentKeywordsByTorrentFilepath(
$keywords,
$this->generateTorrentKeywordsByString(
$source,
$wordTransliteration,
$wordLengthMin,
$wordLengthMax
)
Expand All @@ -195,7 +185,6 @@ public function generateTorrentKeywordsByTorrentFilepath(
$keywords,
$this->generateTorrentKeywordsByString(
$comment,
$wordTransliteration,
$wordLengthMin,
$wordLengthMax
)
Expand Down Expand Up @@ -301,7 +290,6 @@ public function add(
bool $extractSource,
bool $extractComment,

bool $wordTransliteration,
int $wordLengthMin,
int $wordLengthMax,

Expand All @@ -326,7 +314,6 @@ public function add(
$extractInfoHashV2,
$extractSource,
$extractComment,
$wordTransliteration,
$wordLengthMin,
$wordLengthMax
),
Expand Down Expand Up @@ -623,7 +610,6 @@ public function reindexTorrentKeywordsAll(
bool $extractInfoHashV2,
bool $extractSource,
bool $extractComment,
bool $wordTransliteration,
int $wordLengthMin,
int $wordLengthMax
): void
Expand All @@ -643,7 +629,6 @@ public function reindexTorrentKeywordsAll(
$extractInfoHashV2,
$extractSource,
$extractComment,
$wordTransliteration,
$wordLengthMin,
$wordLengthMax
)
Expand Down

0 comments on commit 997666a

Please sign in to comment.