Skip to content

Commit

Permalink
fix: [ocr] filter ocr supported languages + fix type of object accept…
Browse files Browse the repository at this point in the history
…ed by the tracker
  • Loading branch information
Terrtia committed Apr 26, 2024
1 parent 26f9e84 commit 35502d9
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 10 deletions.
4 changes: 2 additions & 2 deletions bin/lib/ail_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,10 @@ def get_default_correlation_objects():
return AIL_OBJECTS_CORRELATIONS_DEFAULT

# Returns the list of object type names treated as "queued".
# NOTE(review): two consecutive returns follow; as written only the first list
# is returned and the second line is unreachable. This looks like the
# before/after pair of a diff hunk -- confirm which list is the intended one.
def get_obj_queued():
return ['item', 'image']
return ['item', 'image', 'message', 'ocr']

# Returns the list of object type names that can be tracked.
# NOTE(review): two consecutive returns follow; as written only the first list
# is returned and the second line (which adds 'ocr') is unreachable. This looks
# like the before/after pair of a diff hunk -- confirm which list is intended.
def get_objects_tracked():
return ['decoded', 'item', 'pgp', 'message', 'title']
return ['decoded', 'item', 'pgp', 'message', 'ocr', 'title']

def get_objects_retro_hunted():
    """Return the object type names that support retro hunting."""
    retro_hunted_types = ['decoded', 'item', 'message']
    return retro_hunted_types
Expand Down
22 changes: 16 additions & 6 deletions bin/lib/objects/Ocrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,14 +296,24 @@ def extract_text(image_path, languages, threshold=0.2):
extracted.append((bbox, text))
return extracted

# TODO OCRS Class

def get_ids():
    """Return the members of the ``ocr:all`` set as reported by ``r_object``."""
    all_ocrs_key = 'ocr:all'
    return r_object.smembers(all_ocrs_key)
def get_ocr_languages():
    """Return the set of two-letter language codes accepted for OCR.

    A new set is built on every call, so callers may mutate the result.
    """
    supported = {
        'af', 'ar', 'as', 'az',
        'be', 'bg', 'bh', 'bs',
        'cs', 'cy',
        'da', 'de',
        'en', 'es', 'et',
        'fa', 'fr',
        'ga',
        'hi', 'hr', 'hu',
        'id', 'is', 'it',
        'ja',
        'kn', 'ko', 'ku',
        'la', 'lt', 'lv',
        'mi', 'mn', 'mr', 'ms', 'mt',
        'ne', 'nl', 'no',
        'oc',
        'pi', 'pl', 'pt',
        'ro', 'ru',
        'sk', 'sl', 'sq', 'sr', 'sv', 'sw',
        'ta', 'te', 'th', 'tl', 'tr',
        'ug', 'uk', 'ur', 'uz',
        'vi',
        'zh',
    }
    return supported

def get_all_ocrs_objects(filters=None):
    """Yield an ``Ocr`` object for every id returned by ``get_ids()``.

    :param filters: accepted for interface compatibility; not used by this
        function. Defaults to ``None`` rather than a shared mutable ``{}``.
    :return: generator of ``Ocr`` instances, one per id.
    """
    for obj_id in get_ids():
        yield Ocr(obj_id)

# Language codes that must be renamed before being handed to the OCR model.
# NOTE(review): 'ch_sim' / 'rs_latin' look like EasyOCR model names -- confirm
# against the OCR backend actually used by extract_text().
_OCR_MODEL_LANGUAGE_CODES = {
    'zh': 'ch_sim',
    'sr': 'rs_latin',
}


def sanityze_ocr_languages(languages, ocr_languages=None):
    """Keep only OCR-supported languages and translate them to model codes.

    :param languages: iterable of language codes to filter; ``None`` or an
        empty iterable yields an empty set.
    :param ocr_languages: collection of supported codes. When missing or empty,
        the default from ``get_ocr_languages()`` is used.
    :return: set of codes; ``'zh'`` becomes ``'ch_sim'`` and ``'sr'`` becomes
        ``'rs_latin'``, every other supported code is kept unchanged.
    """
    # Nothing to filter: avoid iterating over None and skip the default lookup.
    if not languages:
        return set()
    if not ocr_languages:
        ocr_languages = get_ocr_languages()
    return {
        _OCR_MODEL_LANGUAGE_CODES.get(lang, lang)
        for lang in languages
        if lang in ocr_languages
    }

class Ocrs(AbstractDaterangeObjects):
"""
Expand Down
9 changes: 7 additions & 2 deletions bin/modules/OcrExtractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,9 @@
from lib.objects import Messages
from lib.objects import Ocrs


# Default to eng
def get_model_languages(obj, add_en=True):
def get_model_languages(obj, ocr_languages, add_en=True):
if add_en:
model_languages = {'en'}
else:
Expand Down Expand Up @@ -53,6 +54,8 @@ def get_model_languages(obj, add_en=True):
model_languages.add(lang)
return model_languages

model_languages = Ocrs.sanityze_ocr_languages(model_languages, ocr_languages=ocr_languages)

return model_languages

# TODO thread
Expand All @@ -72,6 +75,8 @@ def __init__(self):
config_loader = ConfigLoader()
self.r_cache = config_loader.get_redis_conn("Redis_Cache")

self.ocr_languages = Ocrs.get_ocr_languages()

# Send module state to logs
self.logger.info(f'Module {self.module_name} initialized')

Expand All @@ -95,7 +100,7 @@ def compute(self, message):

if not ocr.exists():
path = image.get_filepath()
languages = get_model_languages(image)
languages = get_model_languages(image, self.ocr_languages)
print(image.id, languages)
texts = Ocrs.extract_text(path, languages)
if texts:
Expand Down
4 changes: 4 additions & 0 deletions var/www/templates/hunter/tracker_add.html
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,10 @@ <h6>Filter PGP by subtype:</h6>
<input class="custom-control-input" type="checkbox" name="message_obj" id="message_obj" checked="">
<label class="custom-control-label" for="message_obj"><i class="fas fa-comment-dots"></i>&nbsp;Message <i class="fas fa-info-circle text-info" data-toggle="tooltip" data-placement="right" title="Messages from Chats"></i></label>
</div>
{# OCR switch: expand icon labels the object type, info-circle carries the tooltip (same layout as the Message switch) #}
<div class="custom-control custom-switch mt-1">
    <input class="custom-control-input" type="checkbox" name="ocr_obj" id="ocr_obj" checked="">
    <label class="custom-control-label" for="ocr_obj"><i class="fas fa-expand"></i>&nbsp;OCR <i class="fas fa-info-circle text-info" data-toggle="tooltip" data-placement="right" title="Text extracted from Images"></i></label>
</div>

{# <div class="custom-control custom-switch mt-1">#}
{# <input class="custom-control-input" type="checkbox" name="level" id="screenshot_obj" checked="">#}
Expand Down

0 comments on commit 35502d9

Please sign in to comment.