From 8f670ab28e83f509992c4146dced5e27692a07ba Mon Sep 17 00:00:00 2001 From: Jose Torres Date: Fri, 5 Feb 2021 19:05:50 +0100 Subject: [PATCH] bump to v3.0.0 --- .dockerignore | 0 .gitignore | 15 +- CHANGELOG | 13 + CONTRIBUTORS | 9 +- Readme.md | 94 +- {faro/test => conf}/__init__.py | 0 conf/commons.yaml | 149 +++ conf/config.py | 12 + config/commons.yaml | 153 --- config/es.yaml | 35 - config/nolanguage.yaml | 35 - .../Dockerfiles/faro/commands/test-local.sh | 2 +- docker/README.md | 12 +- faro/detector.py | 284 ----- faro/document.py | 19 +- faro/faro_entrypoint.py | 120 +- faro/io_parser.py | 41 +- faro/language/__init__.py | 0 faro/language/language_detection.py | 39 + faro/ner.py | 60 - faro/ner_regex.py | 232 ---- faro/sensitivity_score.py | 69 +- faro/test/data/sensitive_data.docx | Bin 15027 -> 0 bytes faro/test/test_ner_regex.py | 1033 ----------------- faro_detection.py | 24 +- faro_spider.sh | 2 +- logger/__init__.py | 0 logger/logger.py | 36 + plugins/__init__.py | 0 plugins/address_bitcoin/__init__.py | 0 plugins/address_bitcoin/entrypoint.py | 56 + plugins/address_bitcoin/pattern.py | 67 ++ plugins/address_bitcoin/test/__init__.py | 0 .../address_bitcoin/test/data/document.txt | 27 + .../test/test_address_bitcoin.py | 39 + plugins/corporate_email/__init__.py | 0 plugins/corporate_email/entrypoint.py | 62 + .../corporate_email}/excl_corp_email_es.txt | 0 plugins/corporate_email/pattern.py | 35 + plugins/credit_card/__init__.py | 0 plugins/credit_card/context-left.txt | 0 plugins/credit_card/context-right.txt | 0 .../credit_card/context.txt | 11 +- plugins/credit_card/context.yaml | 5 + plugins/credit_card/entrypoint.py | 31 + plugins/credit_card/es/__init__.py | 0 plugins/credit_card/es/context-left.txt | 0 plugins/credit_card/es/context-right.txt | 0 plugins/credit_card/es/context.txt | 16 + plugins/credit_card/es/context.yaml | 11 + plugins/credit_card/es/pattern.py | 13 + plugins/credit_card/pattern.py | 35 + plugins/custom_word/__init__.py | 0 
{faro => plugins/custom_word}/custom_word.py | 31 +- .../custom_word/custom_words.txt | 0 plugins/custom_word/entrypoint.py | 32 + plugins/custom_word/es/custom_words.txt | 2 + plugins/email/__init__.py | 0 plugins/email/context.txt | 0 plugins/email/context.yaml | 5 + plugins/email/entrypoint.py | 62 + plugins/email/excl_corp_email_es.txt | 25 + plugins/email/pattern.py | 27 + plugins/financial_data/__init__.py | 0 plugins/financial_data/context-left.txt | 0 plugins/financial_data/context-right.txt | 0 plugins/financial_data/context.txt | 2 + plugins/financial_data/context.yaml | 5 + plugins/financial_data/entrypoint.py | 31 + plugins/financial_data/es/__init__.py | 0 plugins/financial_data/es/context-left.txt | 0 plugins/financial_data/es/context-right.txt | 0 .../financial_data/es/context.txt | 10 +- plugins/financial_data/es/context.yaml | 5 + plugins/financial_data/es/pattern.py | 18 + plugins/financial_data/pattern.py | 29 + plugins/id_document/__init__.py | 0 plugins/id_document/context-left.txt | 0 plugins/id_document/context-right.txt | 0 .../id_document/context.txt | 10 +- plugins/id_document/context.yaml | 5 + plugins/id_document/entrypoint.py | 31 + plugins/id_document/es/__init__.py | 0 plugins/id_document/es/context-left.txt | 0 plugins/id_document/es/context-right.txt | 0 plugins/id_document/es/context.txt | 5 + plugins/id_document/es/context.yaml | 5 + plugins/id_document/es/language.yalm | 0 plugins/id_document/es/pattern.py | 32 + plugins/id_document/pattern.py | 27 + plugins/mobile/__init__.py | 0 plugins/mobile/context-left.txt | 0 plugins/mobile/context-right.txt | 0 plugins/mobile/context.txt | 0 plugins/mobile/context.yaml | 13 + plugins/mobile/entrypoint.py | 31 + plugins/mobile/es/__init__.py | 0 .../mobile/es/context-left.txt | 2 +- plugins/mobile/es/context-right.txt | 0 plugins/mobile/es/context.txt | 27 + plugins/mobile/es/context.yaml | 13 + plugins/mobile/es/language.yalm | 0 plugins/mobile/es/pattern.py | 25 + 
plugins/mobile/pattern.py | 16 + plugins/money/__init__.py | 0 plugins/money/context.txt | 0 plugins/money/context.yaml | 5 + plugins/money/entrypoint.py | 31 + plugins/money/pattern.py | 26 + plugins/orchestrator.py | 62 + plugins/organization/__init__.py | 0 plugins/organization/entrypoint.py | 32 + plugins/person/__init__.py | 0 plugins/person/entrypoint.py | 32 + plugins/phone/__init__.py | 0 plugins/phone/context-left.txt | 0 plugins/phone/context-right.txt | 0 plugins/phone/context.txt | 2 + plugins/phone/context.yaml | 5 + plugins/phone/entrypoint.py | 31 + plugins/phone/es/__init__.py | 0 .../phone/es/context-left.txt | 0 plugins/phone/es/context-right.txt | 0 plugins/phone/es/context.txt | 0 plugins/phone/es/context.yaml | 13 + plugins/phone/es/pattern.py | 24 + plugins/phone/pattern.py | 24 + plugins/phone/pt/__init__.py | 0 plugins/phone/pt/context-left.txt | 5 + plugins/phone/pt/context-right.txt | 0 plugins/phone/pt/context.txt | 0 plugins/phone/pt/context.yaml | 13 + plugins/phone/pt/pattern.py | 24 + plugins/probable_currency_amount/__init__.py | 0 plugins/probable_currency_amount/context.txt | 0 plugins/probable_currency_amount/context.yaml | 5 + .../probable_currency_amount/entrypoint.py | 31 + plugins/probable_currency_amount/pattern.py | 27 + plugins/signature/__init__.py | 0 plugins/signature/config.yaml | 1 + plugins/signature/context.txt | 0 plugins/signature/context.yaml | 5 + plugins/signature/entrypoint.py | 76 ++ plugins/signature/pattern.py | 35 + requirements.txt | 2 +- test/__init__.py | 0 {faro/test => test}/data/lorem.rtf | 0 {faro/test => test}/data/no_metadata.pdf | Bin {faro/test => test}/data/ocr.pdf | Bin test/data/organizations.txt | 6 + {faro/test => test}/data/person_position.pdf | Bin {faro/test => test}/data/protected.pdf | Bin test/data/sensitive_data.docx | Bin 0 -> 17101 bytes {faro/test => test}/data/sensitive_data.pdf | Bin 94388 -> 104978 bytes {faro/test => test}/data/signature_boe.pdf | Bin {faro/test => 
test}/data/split_lines.docx | Bin {faro/test => test}/data/tests.txt | 0 test_faro_cli.py => test/test_faro_cli.py | 14 +- {faro/test => test}/test_faro_entrypoint.py | 48 +- {faro/test => test}/test_utils.py | 4 +- utils/__init__.py | 0 utils/base_detector.py | 33 + utils/base_plugin.py | 45 + {faro => utils}/email.py | 8 +- utils/features/__init__.py | 0 utils/features/entrypoint_feature_base.py | 38 + utils/features/feature_detector.py | 56 + utils/features/ner.py | 52 + utils/features/spacy.py | 69 ++ utils/pattern/__init__.py | 0 utils/pattern/entrypoint_pattern_base.py | 63 + utils/pattern/ner_regex.py | 177 +++ utils/pattern/pattern_base.py | 52 + utils/pattern/pattern_detector.py | 128 ++ utils/singleton.py | 25 + utils/storage.py | 11 + {faro => utils}/utils.py | 11 +- 177 files changed, 2704 insertions(+), 2069 deletions(-) mode change 100644 => 100755 .dockerignore mode change 100644 => 100755 .gitignore rename {faro/test => conf}/__init__.py (100%) create mode 100755 conf/commons.yaml create mode 100755 conf/config.py delete mode 100755 config/commons.yaml delete mode 100755 config/es.yaml delete mode 100755 config/nolanguage.yaml delete mode 100755 faro/detector.py create mode 100755 faro/language/__init__.py create mode 100755 faro/language/language_detection.py delete mode 100755 faro/ner.py delete mode 100755 faro/ner_regex.py delete mode 100755 faro/test/data/sensitive_data.docx delete mode 100755 faro/test/test_ner_regex.py create mode 100755 logger/__init__.py create mode 100755 logger/logger.py create mode 100755 plugins/__init__.py create mode 100755 plugins/address_bitcoin/__init__.py create mode 100755 plugins/address_bitcoin/entrypoint.py create mode 100755 plugins/address_bitcoin/pattern.py create mode 100755 plugins/address_bitcoin/test/__init__.py create mode 100755 plugins/address_bitcoin/test/data/document.txt create mode 100755 plugins/address_bitcoin/test/test_address_bitcoin.py create mode 100755 plugins/corporate_email/__init__.py 
create mode 100755 plugins/corporate_email/entrypoint.py rename {config => plugins/corporate_email}/excl_corp_email_es.txt (100%) create mode 100755 plugins/corporate_email/pattern.py create mode 100755 plugins/credit_card/__init__.py create mode 100755 plugins/credit_card/context-left.txt create mode 100755 plugins/credit_card/context-right.txt rename config/keywords_creditcard_es.txt => plugins/credit_card/context.txt (51%) create mode 100755 plugins/credit_card/context.yaml create mode 100755 plugins/credit_card/entrypoint.py create mode 100755 plugins/credit_card/es/__init__.py create mode 100755 plugins/credit_card/es/context-left.txt create mode 100755 plugins/credit_card/es/context-right.txt create mode 100755 plugins/credit_card/es/context.txt create mode 100755 plugins/credit_card/es/context.yaml create mode 100755 plugins/credit_card/es/pattern.py create mode 100755 plugins/credit_card/pattern.py create mode 100755 plugins/custom_word/__init__.py rename {faro => plugins/custom_word}/custom_word.py (62%) rename config/keywords_custom_words_es.txt => plugins/custom_word/custom_words.txt (100%) create mode 100755 plugins/custom_word/entrypoint.py create mode 100755 plugins/custom_word/es/custom_words.txt create mode 100755 plugins/email/__init__.py create mode 100755 plugins/email/context.txt create mode 100755 plugins/email/context.yaml create mode 100755 plugins/email/entrypoint.py create mode 100755 plugins/email/excl_corp_email_es.txt create mode 100755 plugins/email/pattern.py create mode 100755 plugins/financial_data/__init__.py create mode 100755 plugins/financial_data/context-left.txt create mode 100755 plugins/financial_data/context-right.txt create mode 100755 plugins/financial_data/context.txt create mode 100755 plugins/financial_data/context.yaml create mode 100755 plugins/financial_data/entrypoint.py create mode 100755 plugins/financial_data/es/__init__.py create mode 100755 plugins/financial_data/es/context-left.txt create mode 100755 
plugins/financial_data/es/context-right.txt rename config/keywords_financialdata_es.txt => plugins/financial_data/es/context.txt (87%) create mode 100755 plugins/financial_data/es/context.yaml create mode 100755 plugins/financial_data/es/pattern.py create mode 100755 plugins/financial_data/pattern.py create mode 100755 plugins/id_document/__init__.py create mode 100755 plugins/id_document/context-left.txt create mode 100755 plugins/id_document/context-right.txt rename config/keywords_dni_spain_es.txt => plugins/id_document/context.txt (91%) create mode 100755 plugins/id_document/context.yaml create mode 100755 plugins/id_document/entrypoint.py create mode 100755 plugins/id_document/es/__init__.py create mode 100755 plugins/id_document/es/context-left.txt create mode 100755 plugins/id_document/es/context-right.txt create mode 100755 plugins/id_document/es/context.txt create mode 100755 plugins/id_document/es/context.yaml create mode 100755 plugins/id_document/es/language.yalm create mode 100755 plugins/id_document/es/pattern.py create mode 100755 plugins/id_document/pattern.py create mode 100755 plugins/mobile/__init__.py create mode 100755 plugins/mobile/context-left.txt create mode 100755 plugins/mobile/context-right.txt create mode 100755 plugins/mobile/context.txt create mode 100755 plugins/mobile/context.yaml create mode 100755 plugins/mobile/entrypoint.py create mode 100755 plugins/mobile/es/__init__.py rename config/keywords_mobile_es.txt => plugins/mobile/es/context-left.txt (90%) create mode 100755 plugins/mobile/es/context-right.txt create mode 100755 plugins/mobile/es/context.txt create mode 100755 plugins/mobile/es/context.yaml create mode 100755 plugins/mobile/es/language.yalm create mode 100755 plugins/mobile/es/pattern.py create mode 100755 plugins/mobile/pattern.py create mode 100755 plugins/money/__init__.py create mode 100755 plugins/money/context.txt create mode 100755 plugins/money/context.yaml create mode 100755 plugins/money/entrypoint.py 
create mode 100755 plugins/money/pattern.py create mode 100755 plugins/orchestrator.py create mode 100755 plugins/organization/__init__.py create mode 100755 plugins/organization/entrypoint.py create mode 100755 plugins/person/__init__.py create mode 100755 plugins/person/entrypoint.py create mode 100755 plugins/phone/__init__.py create mode 100755 plugins/phone/context-left.txt create mode 100755 plugins/phone/context-right.txt create mode 100755 plugins/phone/context.txt create mode 100755 plugins/phone/context.yaml create mode 100755 plugins/phone/entrypoint.py create mode 100755 plugins/phone/es/__init__.py rename config/keywords_phone_es.txt => plugins/phone/es/context-left.txt (100%) create mode 100755 plugins/phone/es/context-right.txt create mode 100755 plugins/phone/es/context.txt create mode 100755 plugins/phone/es/context.yaml create mode 100755 plugins/phone/es/pattern.py create mode 100755 plugins/phone/pattern.py create mode 100755 plugins/phone/pt/__init__.py create mode 100755 plugins/phone/pt/context-left.txt create mode 100755 plugins/phone/pt/context-right.txt create mode 100755 plugins/phone/pt/context.txt create mode 100755 plugins/phone/pt/context.yaml create mode 100755 plugins/phone/pt/pattern.py create mode 100755 plugins/probable_currency_amount/__init__.py create mode 100755 plugins/probable_currency_amount/context.txt create mode 100755 plugins/probable_currency_amount/context.yaml create mode 100755 plugins/probable_currency_amount/entrypoint.py create mode 100755 plugins/probable_currency_amount/pattern.py create mode 100755 plugins/signature/__init__.py create mode 100755 plugins/signature/config.yaml create mode 100755 plugins/signature/context.txt create mode 100755 plugins/signature/context.yaml create mode 100755 plugins/signature/entrypoint.py create mode 100755 plugins/signature/pattern.py create mode 100755 test/__init__.py rename {faro/test => test}/data/lorem.rtf (100%) rename {faro/test => test}/data/no_metadata.pdf (100%) 
rename {faro/test => test}/data/ocr.pdf (100%) create mode 100755 test/data/organizations.txt rename {faro/test => test}/data/person_position.pdf (100%) rename {faro/test => test}/data/protected.pdf (100%) create mode 100755 test/data/sensitive_data.docx rename {faro/test => test}/data/sensitive_data.pdf (80%) rename {faro/test => test}/data/signature_boe.pdf (100%) rename {faro/test => test}/data/split_lines.docx (100%) rename {faro/test => test}/data/tests.txt (100%) rename test_faro_cli.py => test/test_faro_cli.py (71%) rename {faro/test => test}/test_faro_entrypoint.py (86%) rename {faro/test => test}/test_utils.py (96%) create mode 100755 utils/__init__.py create mode 100755 utils/base_detector.py create mode 100755 utils/base_plugin.py rename {faro => utils}/email.py (86%) create mode 100755 utils/features/__init__.py create mode 100755 utils/features/entrypoint_feature_base.py create mode 100755 utils/features/feature_detector.py create mode 100755 utils/features/ner.py create mode 100755 utils/features/spacy.py create mode 100755 utils/pattern/__init__.py create mode 100755 utils/pattern/entrypoint_pattern_base.py create mode 100755 utils/pattern/ner_regex.py create mode 100755 utils/pattern/pattern_base.py create mode 100755 utils/pattern/pattern_detector.py create mode 100755 utils/singleton.py create mode 100755 utils/storage.py rename {faro => utils}/utils.py (79%) diff --git a/.dockerignore b/.dockerignore old mode 100644 new mode 100755 diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 index db5dfaf..091a3b7 --- a/.gitignore +++ b/.gitignore @@ -4,9 +4,22 @@ data/* *.orig *.log +*.env +*.list output/ +input/ +.idea/ *.list +!docker_faro_env_example.list +test-reports/* +# Ensure no output files are published *.entity *.score +# Ignore coverage stats and config .coverage -!docker_faro_env_example.list +nosetests.xml +.keep +venv +*.log +*.log* +env diff --git a/CHANGELOG b/CHANGELOG index dbe0c9c..08e5d54 100755 --- a/CHANGELOG +++ 
b/CHANGELOG @@ -1,3 +1,16 @@ +3.0.0 +----- +* Update FARO to allow for plug-in support. +* Decouple FARO 2.0.0 functionality to be run separately in plug-ins +* Add plug-in template to use as a guide for new plug-in integration +* Add plug-in example (address_bitcoin plus tests) based on plug-in template +* Add option to run all plugins in configurable path +* Move tests to separate package +* Simplify configuration +* Support for logging configuration +* Update to tika 1.24 + + 2.0.0 ----- * Add password-protected/encrypted file detection and score them as high sensitivity diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 2611629..a403db0 100755 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -1,7 +1,8 @@ -- Enrique Andrade González (ElevenPaths-TEGRA) -- Hector Cerezo Costas (Gradiant-TEGRA) -- Juan Elosua Tomé (ElevenPaths-TEGRA) -- Rafael P. Martínez Álvarez (Gradiant-TEGRA) +- Enrique Andrade González +- Hector Cerezo Costas +- Juan Elosua Tomé +- Hugo Román García-Pardo Rodríguez +- Rafael P. Martínez Álvarez TEGRA is an R&D Cybersecurity Center based in Galicia (Spain). It is a joint effort from Telefónica, a leading international telecommunications company, through ElevenPaths, its global cybersecurity unit, and Gradiant, an ICT R&D center with more than 100 professionals working in areas like connectivity, security and intelligence, to create innovative products and services inside cybersecurity. diff --git a/Readme.md b/Readme.md index 66bc194..1bbf417 100755 --- a/Readme.md +++ b/Readme.md @@ -47,7 +47,12 @@ IF you are in a rush and just want to give it a try...go [here](docker/README.md The project contains the following folders: * `faro/` : this is FARO module with the main functionality and tests - * `config/`: yaml configuration files go here. There is one yaml file per language (plus one `nolanguage.yaml` to provide basic functionality for non detected languages) and one yaml file with common configurations for all languages `config/commons.yaml`. 
+ * `conf/`: configuration files go here: `conf/commons.yaml` holds the common configuration for the tool and `conf/config.py` sets up logging. + * `plugins/`: Stores all the available plugins to detect sensitive information with the appropriate language support. + * `utils/`: Utilities for faro execution, for example text pre-processing and root classes to implement common plugin functionality. + * `docker/`: Everything related with the execution of faro in a container setup. + * `test/`: Unit tests for faro. + * `logs/` and `logger/`: Definition and storage of logging. * `faro_detection.py`: launcher of FARO for standalone operation over a single file. * `faro_spider.sh`: script for bulk processing. * `nose.cfg`: Configuration for testing faro @@ -110,7 +115,7 @@ These other dependencies are used for testing: #### Tika dependency -We provide some utilities in order to get tika server up and running on your local machine in case is useful donwload this [zip file](https://github.com/ElevenPaths/FARO/releases/download/v2.0.0/tika_external.zip) and uncompress somewhere in your local filesystem. +We provide some utilities in order to get tika server up and running on your local machine. In case it is useful, download this [zip file](https://github.com/ElevenPaths/FARO/releases/download/v3.0.0/tika_external.zip) and uncompress it somewhere in your local filesystem. To fire up tika run: ```unix @@ -120,7 +125,7 @@ $ tika_start.sh To stop tika server: ```unix $ tika_stop.sh -`` +``` ### NER models @@ -148,7 +153,7 @@ FARO creates an "output" folder inside the parent folder of `docker` normally th * `output/scan.$CURRENT_TIME.csv`: is a csv file with the score given to the document and the frequence of indicators in each file. 
``` -filepath,score,monetary_quantity,signature,personal_email,mobile_phone_number,financial_data,document_id,custom_words,meta:content-type,meta:author,meta:pages,meta:lang,meta:date,meta:filesize,meta:num_words,meta:num_chars,meta:ocr +filepath,score,money,signature,personal_email,mobile,financial_data,id_document,custom_word,meta:content-type,meta:encrypted,meta:author,meta:pages,meta:lang,meta:date,meta:filesize,meta:ocr /Users/test/code/FARO_datasets/quick_test_data/Factura_NRU_0_1_001.pdf,high,0,0,0,0,0,1,4,application/pdf,Powered By Crystal,1,es,,85739,219,1185,False /Users/test/code/FARO_datasets/quick_test_data/Factura_Plancha.pdf,high,6,0,0,0,0,2,8,application/pdf,Python PDF Library - http://pybrary.net/pyPdf/,1,es,,77171,259,1524,True /Users/test/code/FARO_datasets/quick_test_data/20190912-FS2019.pdf,high,3,0,0,0,0,1,2,application/pdf,FPDF 1.6,1,es,2019-09-12T20:08:19Z,1545,62,648,False @@ -157,17 +162,17 @@ filepath,score,monetary_quantity,signature,personal_email,mobile_phone_number,fi * `output/scan.$CURRENT_TIME.entity`: is a json with the list of indicators (disaggregated) extracted in a file. 
For example: ``` -{"filepath": "/Users/test/code/FARO_datasets/quick_test_data/Factura_NRU_0_1_001.pdf", "entities": {"custom_words": {"facturar": 3, "total": 1}, "prob_currency": {"12,0021": 1, "12,00": 1, "9,92": 1, "3,9921": 1, "3,99": 1, "3,30": 1, "15,99": 1, "13,21": 1, "1.106.166": 1, "1,00": 1, "99,00": 1}, "document_id": {"89821284M": 1}}, "datetime": "2019-12-11 14:19:17"} -{"filepath": "/Users/test/code/FARO_datasets/quick_test_data/Factura_Plancha.pdf", "entities": {"document_id": {"H82547761": 1, "21809943D": 2}, "custom_words": {"factura": 2, "facturar": 2, "total": 2, "importe": 2}, "monetary_quantity": {"156,20": 4, "2,84": 2, "0,00": 2, "159,04": 2, "32,80": 4, "191,84": 2}, "prob_currency": {"1,00": 6, "189,00": 2}}, "datetime": "2019-12-11 14:19:27"} -{"filepath": "/Users/test/code/FARO_datasets/quick_test_data/20190912-FS2019.pdf", "entities": {"document_id": {"C-01107564": 1}, "custom_words": {"factura": 1, "total": 1}, "monetary_quantity": {"3,06": 1, "0,64": 1, "3,70": 1}}, "datetime": "2019-12-11 14:19:33"} +{"filepath": "/Users/test/code/FARO_datasets/quick_test_data/Factura_NRU_0_1_001.pdf", "entities": {"custom_word": {"facturar": 3, "total": 1}, "probable_currency_amount": {"12,0021": 1, "12,00": 1, "9,92": 1, "3,9921": 1, "3,99": 1, "3,30": 1, "15,99": 1, "13,21": 1, "1.106.166": 1, "1,00": 1, "99,00": 1}, "id_document": {"89821284M": 1}}, "datetime": "2019-12-11 14:19:17"} +{"filepath": "/Users/test/code/FARO_datasets/quick_test_data/Factura_Plancha.pdf", "entities": {"id_document": {"H82547761": 1, "21809943D": 2}, "custom_word": {"factura": 2, "facturar": 2, "total": 2, "importe": 2}, "money": {"156,20": 4, "2,84": 2, "0,00": 2, "159,04": 2, "32,80": 4, "191,84": 2}, "probable_currency_amount": {"1,00": 6, "189,00": 2}}, "datetime": "2019-12-11 14:19:27"} +{"filepath": "/Users/test/code/FARO_datasets/quick_test_data/20190912-FS2019.pdf", "entities": {"id_document": {"C-01107564": 1}, "custom_word": {"factura": 1, "total": 1}, 
"money": {"3,06": 1, "0,64": 1, "3,70": 1}}, "datetime": "2019-12-11 14:19:33"} ``` #### Finetuning Faro Execution After adding OCR there are some configuration that can be customized for FARO execution through environment variables: * `FARO_DISABLE_OCR`: if this variable is found (with any value) FARO will not execute OCR on the documents -* `FARO_REQUESTS_TIMEOUT`: Number of seconds before FARO will timeout if the tika server does not respond (default: 60) -* `FARO_PDF_OCR_RATIO`: Bytes per character used in PDF mixed documents (text and images) to force OCR (default: 150 bytes/char) +* `FARO_REQUESTS_TIMEOUT`: Number of seconds before FARO will timeout if the tika server does not respond (default: 300) +* `FARO_PDF_OCR_RATIO`: Bytes per character used in PDF mixed documents (text and images) to force OCR (default: 500 bytes/char) Logging configuration can also be configured through environment variables: @@ -205,7 +210,7 @@ a) `.entity`: a json with the list of entities ordered by their type b) `.score`: a json with the types of entities and the number this type of entity appears in the text. This json also contains the sensitivy score in the property "score" (it can be "low", "medium" and "high"). ``` -{"score": "high", "summary": {"monetary_quantity": 1, "mobile_phone_number": 1, "personal_email": 1, "credit_account_number": 2}} +{"score": "high", "summary": {"money": 1, "mobile": 1, "personal_email": 1, "financial_data": 2}} ``` For information about additional arguments that can be passed to our detection script, take a look [here](#faro-detection-additional-arguments). @@ -220,17 +225,17 @@ The FARO entity detector performs two steps: The list of indicators are the following: - * **monetary_quantity**: money quantity (currently only euros and dollars are supported). + * **money**: money quantity (currently only euros and dollars are supported). 
* **signature**: it outputs the person who signs a document * **personal_email**: emails that are not corporative (e.g. not info@ rrhh@ ) - * **mobile_phone_number**: mobile phone numbers (filtering out non mobile ones) + * **mobile**: mobile phone numbers (filtering out non mobile ones) * **financial_data**: credit cards and IBAN account numbers - * **document_id**: Spanish NIF and CIF. + * **id_document**: Spanish NIF and CIF. The unique counts of these sentences are gathered in a json object and relayed as input to the next step. @@ -246,51 +251,48 @@ The following rules are applied: ### Configuration -It employs a YAML set of files for configuring its functionality (the YAML files are located inside the "config" folder) - -* common.yaml: has the common functionality to every language +It employs a YAML set of files for configuring its functionality (the YAML files are located inside the "conf" folder) -* .yaml: has the specific configuration for a language (currently only spanish is supported: "es" code). It also indicates where the ML Models are located (e.g. by default inside the "models" folder) +* `commons.yaml`: has the common configuration for the tool. +* `config.py`: Sets the logging for faro execution #### Configuration of the sensitivity score Those are a collection of conditions that selects a score following the specification of the configuration file. The levels are configured in the sensitivity_list sorted by their intensity (from less to more sensitive). The sensitivity dict contains the conditions (min, max) ordered by type of entity. The system only needs to fulfill one condition of a certain level in order to flag the document with that level of sensitivity. Furtheremore if multiple KPIs of a certain leve are found in the document (as marked by the sensitivity_multiple_kpis parameter), the system increases their sensitivity level (e.g. from medium to high). 
``` -sensitivity_list: - - low - - medium - - high - - -sensitivity_multiple_kpis: 3 - sensitivity: - low: - person_position: - min: 1 - max: 5 - monetary_quantity: - min: 1 - max: 5 - - signature: - min: 0 - max: 0 - - personal_email: - min: 0 - max: 0 - - .... - + sensitivity_list: + - low + - medium + - high + sensitivity_multiple_kpis: 3 ``` * sensitivity_list is the list of different sensitivity scores ordered by intensity. * sensitivity_multiple_kpis this number indicates the simultaneous number of scores in a level allowed before leveling up the sensitivy score -* sensitivity is a dict with the sensitivity conditions that must be satisfied in order to reach a sensitivity level. +Each entity can also be configured with the number of occurrences needed to reach each level (low, medium or high), using a sensitivity dict with the (min, max) conditions that must be satisfied in order to reach that sensitivity level. + +``` +entities: + MONEY: + description: money + output: true + sensitivity: + low: + min: 1 + max: 6 + medium: + min: 6 + max: 65535 + high: + min: 65535 + max: 65535 + .... +``` + ### Supported Input File Formats @@ -309,8 +311,8 @@ Mails are extracted with RegExp. A ML classifier and heuristics are used to dist `--dump`: the system dumps the information of .score to stdout in csv format. E.g. 
an example of output might be: ``` -id_file,score,person_jobposition_organization,monetary_quantity,sign,personal_email,mobile_phone_number,credit_account_number,id_document -data/test/test2.pdf,medium,3,0,1,0,0,0,0 +filepath,score,money,signature,personal_email,mobile,financial_data,id_document,custom_word,meta:content-type,meta:encrypted,meta:author,meta:pages,meta:lang,meta:date,meta:filesize,meta:ocr +/Users/test/code/FARO_datasets/quick_test_data/Factura_NRU_0_1_001.pdf,high,0,0,0,0,0,1,4,application/pdf,Powered By Crystal,1,es,,85739,219,1185,False ``` diff --git a/faro/test/__init__.py b/conf/__init__.py similarity index 100% rename from faro/test/__init__.py rename to conf/__init__.py diff --git a/conf/commons.yaml b/conf/commons.yaml new file mode 100755 index 0000000..827432d --- /dev/null +++ b/conf/commons.yaml @@ -0,0 +1,149 @@ +entities: + PER: + description: person + output: false + ORG: + description: organization + output: false + LOC: + description: localization + output: false + MISC: + description: miscelaneous + output: false + FINANCIAL_DATA: + description: financial_data + output: true + sensitivity: + low: + min: 0 + max: 0 + medium: + min: 0 + max: 0 + high: + min: 1 + max: 65535 + MONEY: + description: money + output: true + sensitivity: + low: + min: 1 + max: 6 + medium: + min: 6 + max: 65535 + high: + min: 65535 + max: 65535 + PROB_CURRENCY: + description: probable_currency_amount + output: false + EMAIL: + description: personal_email + output: true + sensitivity: + low: + min: 1 + max: 2 + medium: + min: 2 + max: 65535 + high: + min: 65535 + max: 65535 + CORP_EMAIL: + description: corporate_email + output: false + ID_DOCUMENT: + description: id_document + output: true + sensitivity: + low: + min: 0 + max: 0 + medium: + min: 0 + max: 0 + high: + min: 1 + max: 65535 + MOBILE: + description: mobile + output: true + sensitivity: + low: + min: 1 + max: 2 + medium: + min: 2 + max: 4 + high: + min: 4 + max: 65535 + PHONE: + description: phone 
+ output: false + SIGNATURE: + description: signature + output: true + max_distance: 15 + sensitivity: + low: + min: 0 + max: 0 + medium: + min: 1 + max: 2 + high: + min: 2 + max: 65535 + CUSTOM: + description: custom_word + output: true + sensitivity: + low: + min: 0 + max: 0 + medium: + min: 0 + max: 0 + high: + min: 1 + max: 65535 + +plugins: + all: false + available_list: + - financial_data + - mobile + - credit_card + - id_document + - phone + - money + - custom_word + - email + - corporate_email + - probable_currency_amount + - person + - organization + - signature + - address_bitcoin + +# These entities need to be synchronized with faro_spider.sh +spider_output_entities: + - money + - signature + - personal_email + - mobile + - financial_data + - id_document + - custom_word + +sensitivity: + sensitivity_list: + - low + - medium + - high + sensitivity_multiple_kpis: 3 diff --git a/conf/config.py b/conf/config.py new file mode 100755 index 0000000..52906eb --- /dev/null +++ b/conf/config.py @@ -0,0 +1,12 @@ +# Logger +import logging +import os + +LOG_FILE_NAME = 'faro-community.log' +LOG_LEVEL = os.getenv('FARO_LOG_LEVEL', "INFO") + +logging.basicConfig( + level=LOG_LEVEL, + format="%(levelname)s: %(name)20s: %(message)s", + handlers=[logging.StreamHandler()] + ) \ No newline at end of file diff --git a/config/commons.yaml b/config/commons.yaml deleted file mode 100755 index cc6f5a4..0000000 --- a/config/commons.yaml +++ /dev/null @@ -1,153 +0,0 @@ -features: - PER: - description: person - output: false - ORG: - description: organization - output: false - LOC: - description: localization - output: false - MISC: - description: miscelaneous - output: false - FINANCIAL_DATA: - description: financial_data - output: true - MONEY: - description: monetary_quantity - output: true - PROB_CURRENCY: - description: probable_currency_amount - output: false - EMAIL: - description: personal_email - output: true - CORP_EMAIL: - description: corporate_email - output: false - 
ID_DOCUMENT: - description: document_id - output: true - MOBILE: - description: mobile_phone_number - output: true - PHONE: - description: phone_number - output: false - SIGNATURE: - description: signature - output: true - max_distance: 15 - CUSTOM: - description: custom_words - output: true - -# Features analyzed by faro that will be written to the output -# together with the document metadata and final score -# TODO: how to do this better, maybe wait until we have a frontend? -scoring_output_features: - - monetary_quantity - - signature - - personal_email - - mobile_phone_number - - financial_data - - document_id - - custom_words - - -sensitivity_list: - - low - - medium - - high - -sensitivity_multiple_kpis: 3 - -sensitivity: - low: - monetary_quantity: - min: 1 - max: 6 - - signature: - min: 0 - max: 0 - - personal_email: - min: 1 - max: 2 - - mobile_phone_number: - min: 1 - max: 2 - - financial_data: - min: 0 - max: 0 - - document_id: - min: 0 - max: 0 - - custom_words: - min: 0 - max: 0 - - medium: - monetary_quantity: - min: 6 - max: 65535 - - signature: - min: 1 - max: 2 - - personal_email: - min: 2 - max: 65535 - - mobile_phone_number: - min: 2 - max: 4 - - financial_data: - min: 0 - max: 0 - - document_id: - min: 0 - max: 0 - - custom_words: - min: 0 - max: 0 - - high: - monetary_quantity: - min: 65535 - max: 65535 - - signature: - min: 2 - max: 65535 - - personal_email: - min: 65535 - max: 65535 - - mobile_phone_number: - min: 4 - max: 65535 - - financial_data: - min: 1 - max: 65535 - - document_id: - min: 1 - max: 65535 - - custom_words: - min: 1 - max: 65535 - diff --git a/config/es.yaml b/config/es.yaml deleted file mode 100755 index 38736ec..0000000 --- a/config/es.yaml +++ /dev/null @@ -1,35 +0,0 @@ -regexp_config: - CreditCard: - word_file: keywords_creditcard_es.txt - left_span_len: 20 - right_span_len: 0 - - FinancialData: - word_file: keywords_financialdata_es.txt - left_span_len: 20 - right_span_len: 0 - - DNI_SPAIN: - word_file: 
keywords_dni_spain_es.txt - left_span_len: 20 - right_span_len: 0 - - PHONE: - word_file: keywords_phone_es.txt - left_span_len: 20 - right_span_len: 0 - - MOBILE: - word_file: keywords_mobile_es.txt - left_span_len: 20 - right_span_len: 0 - -email_config: - excl_file: excl_corp_email_es.txt - -ner_config: - nlp_model : es_core_news_sm - -custom_config: - word_file: keywords_custom_words_es.txt - diff --git a/config/nolanguage.yaml b/config/nolanguage.yaml deleted file mode 100755 index c9e5913..0000000 --- a/config/nolanguage.yaml +++ /dev/null @@ -1,35 +0,0 @@ -regexp_config: - CreditCard: - word_file: keywords_creditcard_es.txt - left_span_len: 20 - right_span_len: 0 - - FinancialData: - word_file: keywords_financialdata_es.txt - left_span_len: 20 - right_span_len: 0 - - DNI_SPAIN: - word_file: keywords_dni_spain_es.txt - left_span_len: 20 - right_span_len: 0 - - PHONE: - word_file: keywords_phone_es.txt - left_span_len: 20 - right_span_len: 0 - - MOBILE: - word_file: keywords_mobile_es.txt - left_span_len: 20 - right_span_len: 0 - -email_config: - excl_file: excl_corp_email_es.txt - -ner_config: - nlp_model : xx_ent_wiki_sm - -custom_config: - word_file: keywords_custom_words_es.txt - diff --git a/docker/Dockerfiles/faro/commands/test-local.sh b/docker/Dockerfiles/faro/commands/test-local.sh index bb62b82..6e32e13 100755 --- a/docker/Dockerfiles/faro/commands/test-local.sh +++ b/docker/Dockerfiles/faro/commands/test-local.sh @@ -17,4 +17,4 @@ then echo "Error: Looks like tika server is unreachable" exit 1 fi -nosetests -sv ./test_*.py ./faro/test/test_*.py --with-coverage --cover-package=faro +nosetests -sv ./test/test_*.py --with-coverage --cover-package=faro diff --git a/docker/README.md b/docker/README.md index 7f1f910..231baa9 100755 --- a/docker/README.md +++ b/docker/README.md @@ -30,8 +30,8 @@ If on the other hand you want to develop or contribute to faro use the [developm First you'll need to download the images binaries from our repo to your target 
machine, for example: ``` $ cd ~/Downloads -$ wget https://github.com/ElevenPaths/FARO/releases/download/v2.0.0/faro.tar.gz -$ wget https://github.com/ElevenPaths/FARO/releases/download/v2.0.0/tika.tar.gz +$ wget https://github.com/ElevenPaths/FARO/releases/download/v3.0.0/faro.tar.gz +$ wget https://github.com/ElevenPaths/FARO/releases/download/v3.0.0/tika.tar.gz ``` Once in your target machine you'll need to load those images into docker, for example: @@ -91,7 +91,7 @@ FARO creates an "output" folder inside the parent folder of `docker` normally th * `output/scan.$CURRENT_TIME.csv`: is a csv file with the score given to the document and the frequence of indicators in each file. ``` -filepath,score,monetary_quantity,signature,personal_email,mobile_phone_number,financial_data,document_id,custom_words,meta:content-type,meta:author,meta:pages,meta:lang,meta:date,meta:filesize,meta:num_words,meta:num_chars,meta:ocr +filepath,score,money,signature,personal_email,mobile,financial_data,id_document,custom_word,meta:content-type,meta:encrypted,meta:author,meta:pages,meta:lang,meta:date,meta:filesize,meta:ocr /Users/test/code/FARO_datasets/quick_test_data/Factura_NRU_0_1_001.pdf,high,0,0,0,0,0,1,4,application/pdf,Powered By Crystal,1,es,,85739,219,1185,False /Users/test/code/FARO_datasets/quick_test_data/Factura_Plancha.pdf,high,6,0,0,0,0,2,8,application/pdf,Python PDF Library - http://pybrary.net/pyPdf/,1,es,,77171,259,1524,True /Users/test/code/FARO_datasets/quick_test_data/20190912-FS2019.pdf,high,3,0,0,0,0,1,2,application/pdf,FPDF 1.6,1,es,2019-09-12T20:08:19Z,1545,62,648,False @@ -100,9 +100,9 @@ filepath,score,monetary_quantity,signature,personal_email,mobile_phone_number,fi * `output/scan.$CURRENT_TIME.entity`: is a json with the list of indicators (disaggregated) extracted in a file. 
For example: ``` -{"filepath": "/Users/test/code/FARO_datasets/quick_test_data/Factura_NRU_0_1_001.pdf", "entities": {"custom_words": {"facturar": 3, "total": 1}, "prob_currency": {"12,0021": 1, "12,00": 1, "9,92": 1, "3,9921": 1, "3,99": 1, "3,30": 1, "15,99": 1, "13,21": 1, "1.106.166": 1, "1,00": 1, "99,00": 1}, "document_id": {"89821284M": 1}}, "datetime": "2019-12-11 14:19:17"} -{"filepath": "/Users/test/code/FARO_datasets/quick_test_data/Factura_Plancha.pdf", "entities": {"document_id": {"H82547761": 1, "21809943D": 2}, "custom_words": {"factura": 2, "facturar": 2, "total": 2, "importe": 2}, "monetary_quantity": {"156,20": 4, "2,84": 2, "0,00": 2, "159,04": 2, "32,80": 4, "191,84": 2}, "prob_currency": {"1,00": 6, "189,00": 2}}, "datetime": "2019-12-11 14:19:27"} -{"filepath": "/Users/test/code/FARO_datasets/quick_test_data/20190912-FS2019.pdf", "entities": {"document_id": {"C-01107564": 1}, "custom_words": {"factura": 1, "total": 1}, "monetary_quantity": {"3,06": 1, "0,64": 1, "3,70": 1}}, "datetime": "2019-12-11 14:19:33"} +{"filepath": "/Users/test/code/FARO_datasets/quick_test_data/Factura_NRU_0_1_001.pdf", "entities": {"custom_word": {"facturar": 3, "total": 1}, "probable_currency_amount": {"12,0021": 1, "12,00": 1, "9,92": 1, "3,9921": 1, "3,99": 1, "3,30": 1, "15,99": 1, "13,21": 1, "1.106.166": 1, "1,00": 1, "99,00": 1}, "id_document": {"89821284M": 1}}, "datetime": "2019-12-11 14:19:17"} +{"filepath": "/Users/test/code/FARO_datasets/quick_test_data/Factura_Plancha.pdf", "entities": {"id_document": {"H82547761": 1, "21809943D": 2}, "custom_word": {"factura": 2, "facturar": 2, "total": 2, "importe": 2}, "money": {"156,20": 4, "2,84": 2, "0,00": 2, "159,04": 2, "32,80": 4, "191,84": 2}, "probable_currency_amount": {"1,00": 6, "189,00": 2}}, "datetime": "2019-12-11 14:19:27"} +{"filepath": "/Users/test/code/FARO_datasets/quick_test_data/20190912-FS2019.pdf", "entities": {"document_id": {"C-01107564": 1}, "custom_word": {"factura": 1, "total": 1}, 
"money": {"3,06": 1, "0,64": 1, "3,70": 1}}, "datetime": "2019-12-11 14:19:33"} ``` ## Run tests diff --git a/faro/detector.py b/faro/detector.py deleted file mode 100755 index 15c6235..0000000 --- a/faro/detector.py +++ /dev/null @@ -1,284 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import logging -import os -import spacy -from .utils import normalize_text, clean_text -from stdnum import get_cc_module -from stdnum.luhn import validate -from stdnum.exceptions import InvalidChecksum, InvalidFormat -from .ner import NER -from .email import EmailFilter -from .ner_regex import RegexNer -from .custom_word import CustomWordDetector -from collections import OrderedDict - -CWD = os.path.dirname(__file__) -CONFIG_PATH = os.path.join(CWD, '..', 'config') -MODELS_PATH = os.path.join(CWD, '..', 'models') -_COMMONS_YAML = "%s/commons.yaml" % CONFIG_PATH - -logger = logging.getLogger(__name__) - - -class Detector(object): - """ Main class for extracting KPIs of confidential documents - - """ - - def _get_signature(self, person_signed_idx, next_person_has_signed, total_ent_list): - if next_person_has_signed: - min_itx_signed = self.signature_max_distance - id_min_itx = -1 - - for i in range(len(total_ent_list)): - ent = total_ent_list[i] - if ent[1] == "PER" and int(ent[3]) > person_signed_idx and int( - ent[3]) - person_signed_idx < min_itx_signed: - min_itx_signed = int(ent[3]) - person_signed_idx - id_min_itx = i - next_person_has_signed = False - - if id_min_itx != -1: - ent = total_ent_list[id_min_itx] - total_ent_list.append((ent[0], "SIGNATURE", ent[2], ent[3], ent[4])) - - def _extract_entities_ml(self, sent, offset, total_ent_list): - if self.ml_ner is not None: - ent_list_ner = self.ml_ner.get_model_entities(sent) - - for ent in ent_list_ner: - # storing as entity/label pair - new_ent = [ent[0], - ent[1], - "NER", - str(int(ent[2]) + offset), - str(int(ent[3]) + offset)] - - total_ent_list.append(new_ent) - - def _entity_regex_email(self, ent, offset, 
total_ent_list): - if self.corp_email_class.is_corp_email(ent[0]): - total_ent_list.append(( - ent[0], - "CORP_EMAIL", - ent[1], - str(ent[2] + offset), - str(ent[3] + offset))) - else: - total_ent_list.append((ent[0], - "EMAIL", - ent[1], - str(ent[2] + offset), - str(ent[3] + offset))) - - @staticmethod - def _entity_regex_credit_card(ent, offset, total_ent_list): - sent = clean_text(ent[0]) - try: - if validate(sent): - logger.debug( - "Credit card accepted {}.{}".format(sent, ent[0])) - - total_ent_list.append((ent[0], - "FINANCIAL_DATA", - ent[1], - str(ent[2] + offset), - str(ent[3] + offset))) - - except (InvalidChecksum, InvalidFormat): - logger.debug("Wrong credit card {}.{}.".format(sent, ent[0])) - - def _entity_signed_person(self, total_ent_list, person_signed_idx, next_person_has_signed): - min_itx_signed = self.signature_max_distance - id_min_itx = -1 - - for i in range(len(total_ent_list)): - _ent = total_ent_list[i] - - if _ent[1] == "PER" and int(_ent[3]) > person_signed_idx and int( - _ent[3]) - person_signed_idx < min_itx_signed: - min_itx_signed = (int(_ent[3]) - person_signed_idx) - id_min_itx = i - next_person_has_signed = False - - if id_min_itx != -1: - _ent = total_ent_list[id_min_itx] - - total_ent_list.append((_ent[0], "SIGNATURE", _ent[2], _ent[3], _ent[4])) - return next_person_has_signed - - @staticmethod - def _entity_financial_data(ent, ent_key, offset, total_ent_list): - sent = clean_text(ent[0]) - if get_cc_module('es', 'ccc').is_valid(sent) or get_cc_module('es', 'iban').is_valid(sent): - total_ent_list.append((ent[0], ent_key, ent[1], str(ent[2] + offset), str(ent[3] + offset))) - else: - logger.debug("Invalid financial data {}.{}".format(sent, ent[0])) - - @staticmethod - def _entity_id_document(ent, ent_key, offset, total_ent_list): - sent = clean_text(ent[0]) - if (get_cc_module('es', 'dni').is_valid(sent) or - get_cc_module('es', 'cif').is_valid(sent) or - get_cc_module('es', 'nie').is_valid(sent)): - 
total_ent_list.append((ent[0], ent_key, ent[1], str(ent[2] + offset), str(ent[3] + offset))) - else: - logger.debug("Invalid data ID document {}.{}".format(sent, ent[0])) - - def _extract_entities_regex(self, offset, sent, full_text, total_ent_list, next_person_has_signed): - ent_list_regex = self.regex_ner.regex_detection(sent, full_text, offset) - - for ent_key in ent_list_regex.keys(): - for ent in ent_list_regex[ent_key]: - - # We treat differently common corporative/personal emails - if ent_key == "EMAIL": - self._entity_regex_email(ent, offset, total_ent_list) - - elif ent_key == "SIGNATURE": - next_person_has_signed = True - person_signed_idx = int(ent[3]) + offset - - elif ent_key == "CREDIT_CARD": - self._entity_regex_credit_card(ent, offset, total_ent_list) - elif ent_key == "FINANCIAL_DATA": - self._entity_financial_data(ent, ent_key, offset, total_ent_list) - elif ent_key == "ID_DOCUMENT": - self._entity_id_document(ent, ent_key, offset, total_ent_list) - else: - total_ent_list.append((ent[0], - ent_key, - ent[1], - str(ent[2] + offset), - str(ent[3] + offset))) - if next_person_has_signed: - self._entity_signed_person(total_ent_list, person_signed_idx, next_person_has_signed) - - def _detection_custom_word(self, sent, offset, total_ent_list): - custom_list = self.custom_detector.search_custom_words(sent) - for _ent in custom_list: - total_ent_list.append((_ent[0], - _ent[1], - _ent[0], - str(_ent[2] + offset), - str(_ent[3] + offset))) - - def _get_kpis(self, sent_list): - """ Extract KPIs from document """ - - # full_text is used for proximity detection - full_text = "".join(sent_list) - - total_ent_list = [] - - # Flag to indicate that a sign entity is expected (if True) - next_person_has_signed = False - person_signed_idx = 0 - - offset = 0 - - for sent in sent_list: - line_length = len(sent) - - # extract entities (ML) - self._extract_entities_ml(sent, offset, total_ent_list) - - # extract entities (Regex) - self._extract_entities_regex(offset, 
sent, full_text, total_ent_list, next_person_has_signed) - - # detection of custom words - self._detection_custom_word(sent, offset, total_ent_list) - - offset += line_length - - self._get_signature(person_signed_idx, next_person_has_signed, total_ent_list) - - return total_ent_list - - @staticmethod - def _get_unique_ents(ent_list): - """ Process the entities to obtain a json object """ - unique_ent_dict = {} - for _ent in ent_list: - if _ent[1] not in unique_ent_dict: - unique_ent_dict[_ent[1]] = {} - if _ent[0] not in unique_ent_dict[_ent[1]]: - unique_ent_dict[_ent[1]][_ent[0]] = 0 - unique_ent_dict[_ent[1]][_ent[0]] += 1 - return unique_ent_dict - - def analyse(self, content): - """ Obtain KPIs from a document and obtain the output in the right format (json) - - Keyword arguments: - content -- list of sentences to obtain the entities - - """ - total_ent_list = self._get_kpis(content) - unique_ent_dict = self._get_unique_ents(total_ent_list) - return unique_ent_dict - - def __init__(self, config): - """ Intialization - - Keyword Arguments: - config -- a dict with yaml configuration parameters - - Properties - nlp -- a spacy model or None - custom_word_list -- list with custom words - regexp_config_dict -- configuration of the proximity detections - signature_max_distance -- maximum distance between distance and signature - low_priority_list -- list of entity types with low priority - - """ - - # build the system here - nlp = None - cfg_section = "ner_config" - cfg_item = "nlp_model" - if cfg_section in config and cfg_item in config[cfg_section]: - nlp = spacy.load(config[cfg_section][cfg_item]) - - # Custom word that the organization wants to detect as sensitive - custom_word_list = [] - cfg_section = "custom_config" - cfg_item = "word_file" - if cfg_section in config and cfg_item in config[cfg_section]: - with open('%s/%s' % (CONFIG_PATH, config[cfg_section][cfg_item]), "r") as f_in: - custom_word_list = [line.rstrip("\n") for line in f_in] - - # configuration 
of the proximity regexp - regexp_config_dict = {} - if "regexp_config" in config: - for key in config["regexp_config"]: - regexp_config_dict[key] = {} - regexp_config_dict[key]["left_span_len"] = int(config["regexp_config"][key]["left_span_len"]) - regexp_config_dict[key]["right_span_len"] = int(config["regexp_config"][key]["right_span_len"]) - - with open('%s/%s' % ( - CONFIG_PATH, config["regexp_config"][key]["word_file"]), "r") as f_in: - word_list = [normalize_text(line.rstrip("\n").strip()) for line in f_in] - - regexp_config_dict[key]["word_list"] = word_list - - # Email filter known corporative (non sensitive) email accounts - cfg_section = "email_config" - cfg_item = "excl_file" - if cfg_section in config and cfg_item in config[cfg_section]: - with open('%s/%s' % (CONFIG_PATH, config[cfg_section][cfg_item]), "r") as f_in: - excl_corp_list = [line.rstrip("\n") for line in f_in] - - if nlp is not None: - self.ml_ner = NER(nlp, None) - else: - self.ml_ner = None - - self.custom_detector = CustomWordDetector(nlp, custom_word_list) - - self.regex_ner = RegexNer(regexp_config_dict=regexp_config_dict) - self.corp_email_class = EmailFilter(excl_corp_list) - - max_distance = config["features"]["SIGNATURE"]["max_distance"] - self.signature_max_distance = max_distance diff --git a/faro/document.py b/faro/document.py index 2fe8351..9c977ae 100755 --- a/faro/document.py +++ b/faro/document.py @@ -1,10 +1,14 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- import logging -from .utils import preprocess_file_content -from .io_parser import parse_file +import os +from conf import config from collections import OrderedDict +from faro.io_parser import parse_file +from logger import logger +from utils.utils import preprocess_file_content + META_AUTHOR = "meta:author" META_CONTENT_TYPE = "meta:content-type" META_ENCRYPTED = "meta:encrypted" @@ -14,7 +18,8 @@ META_FILE_SIZE = "meta:filesize" META_OCR = "meta:ocr" -logger = logging.getLogger(__name__) +script_name = 
os.path.basename(__file__) +faro_logger = logger.Logger(logger_name=script_name, file_name=config.LOG_FILE_NAME, logging_level=config.LOG_LEVEL) def _assign_author_metadata(metadata): @@ -144,7 +149,8 @@ def _parse_metadata(self, metadata): meta_dict -- dict of metadata (as returned by tika) """ - logger.debug("METADATA DICT {}".format(metadata)) + message = "METADATA DICT {}".format(metadata) + faro_logger.debug(script_name, self._parse_metadata.__name__, message) if metadata is None: self.metadata_error = True @@ -170,14 +176,14 @@ def _parse_metadata(self, metadata): # Creation date self.creation_date = _assign_creation_date_metadata(metadata) - def get_document_data(self): + def parse_document_data(self): """ Launch tika parser and retrieve both content and metadata """ # parse input file and join sentences if requested try: tika_content, tika_metadata = parse_file(self.document_path) - except Exception: + except Exception as e: tika_content = "" tika_metadata = None @@ -196,3 +202,4 @@ def __init__(self, document_path, split_lines): self.document_path = document_path # store wether or not we should split lines or not self.split_lines = split_lines + self.content = {} diff --git a/faro/faro_entrypoint.py b/faro/faro_entrypoint.py index e21265e..fae1f03 100755 --- a/faro/faro_entrypoint.py +++ b/faro/faro_entrypoint.py @@ -1,31 +1,33 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import logging -import io -import os -import sys import csv -import yaml +import datetime +import io import json +import sys import time -import datetime -from langdetect import detect +from pathlib import Path + +import yaml from langdetect import DetectorFactory -from langdetect.lang_detect_exception import LangDetectException -from faro.detector import Detector + +from conf import config +from faro.document import FARODocument +from faro.language.language_detection import language_detection from faro.sensitivity_score import SensitivityScorer -from .document import FARODocument 
+from logger import logger +from plugins.orchestrator import Orchestrator -CWD = os.path.dirname(__file__) -CONFIG_PATH = os.path.join(CWD, '..', 'config') -MODELS_PATH = os.path.join(CWD, '..', 'models') -_COMMONS_YAML = "%s/commons.yaml" % CONFIG_PATH +CWD = Path(__file__).parent.parent +CONFIG_PATH = CWD / "conf" +_COMMONS_YAML = CONFIG_PATH / "commons.yaml" ACCEPTED_LANGS = ["es"] # init the seeed of the lang detection algorithm DetectorFactory.seed = 0 -logger = logging.getLogger(__name__) +script_name = Path(__file__).name +faro_logger = logger.Logger(logger_name=script_name, file_name=config.LOG_FILE_NAME, logging_level=config.LOG_LEVEL) def _check_input_params(params): @@ -46,27 +48,13 @@ def _check_input_params(params): if not hasattr(params, 'dump'): params.dump = False - return params - - -def _customize_faro_engine_by_language(lang): - # TODO: refactor code, we need to simplify the flow since docs with no content - # go through a lot of unnecessary processing - if lang in ACCEPTED_LANGS: - with open("%s/%s.%s" % (CONFIG_PATH, lang, "yaml"), "r") as stream: - config = yaml.load(stream, Loader=yaml.FullLoader) - else: - logger.debug("Language {} is not fully supported. 
All the " + - "functionality is only implemented for these languages: {}".format( - lang, " ".join(ACCEPTED_LANGS))) - - with open("%s/nolanguage.%s" % (CONFIG_PATH, "yaml"), "r") as stream: - config = yaml.load(stream, Loader=yaml.FullLoader) - return config + if not hasattr(params, 'filehash'): + params.filehash = None + return params -def _generate_entities_output(entities, params, config): +def _generate_entities_output(entities, params, conf): """ Generate entities output humanizing feature descriptions """ @@ -75,16 +63,23 @@ def _generate_entities_output(entities, params, config): if not params.verbose: # Dict comprehension to filter out not verbose output filtered_entities = {k: v for k, - v in entities.items() if config["features"][k]["output"] == True} + v in entities.items() if conf["entities"][k]["output"] == True} else: filtered_entities = entities - output_entities = {config["features"][k]["description"]: v for k, - v in filtered_entities.items()} + output_entities = {conf["entities"][k]["description"]: v for k, + v in filtered_entities.items()} entity_dict = {"filepath": params.input_file, "entities": output_entities, "datetime": st} + return entity_dict + + +def _persist_entities_output(entity_dict, params): + """ + Persist detected entities to disk + """ with io.open(params.output_entity_file, "a+") as f_out: f_out.write("{}\n".format(json.dumps(entity_dict, ensure_ascii=False))) @@ -104,7 +99,7 @@ def _compute_scoring(scorer, entities, faro_doc): return result -def _generate_scoring_output(result, params, config, faro_doc): +def _generate_scoring_output(result, params, conf, faro_doc): # Adding metadata to output result.update(faro_doc.get_metadata()) @@ -117,61 +112,62 @@ def _generate_scoring_output(result, params, config, faro_doc): # Create list with output fieldnames header = ["id_file", "score"] #  Add all sensitive info categories - header.extend(config["scoring_output_features"]) + header.extend(conf["spider_output_entities"]) # Add 
document metadata header.extend(faro_doc.get_metadata().keys()) writer = csv.DictWriter(sys.stdout, fieldnames=header, extrasaction='ignore', restval=0) result["id_file"] = params.input_file - logging.debug("JSON (Entities detected) {}".format( - json.dumps(result, ensure_ascii=False))) + message = "JSON (Entities detected) {}".format( + json.dumps(result, ensure_ascii=False)) + faro_logger.debug(script_name, + _generate_scoring_output.__name__, + message) writer.writerow(result) -def language_detection(file_lines): - try: - lang = detect(" ".join(file_lines)) - except LangDetectException: - lang = "unk" - return lang - - def faro_execute(params): """ Execution of the main loop """ # Validate params params = _check_input_params(params) # reading commons configuration - with open(_COMMONS_YAML, "r") as f_stream: + with open(_COMMONS_YAML, "r", encoding='utf8') as f_stream: commons_config = yaml.load(f_stream, Loader=yaml.FullLoader) # parse input file and join sentences if requested - logger.info("Analysing {}".format(params.input_file)) + message = "Analysing {}".format(params.input_file) + faro_logger.info(script_name, faro_execute.__name__, message) # Initialize our document representation faro_doc = FARODocument(params.input_file, params.split_lines) # Parse document and extract content and metadata - faro_doc.get_document_data() + faro_doc.parse_document_data() # Language customization lang = language_detection(faro_doc.content) faro_doc.set_language(lang) - config = _customize_faro_engine_by_language(lang) + lang = {"lang": lang} # joining two dicts with configurations # config becomes a shallowly merged dictionary with values from commons_config - #  replacing those from config - config = {**config, **commons_config} + # replacing those from config + conf = {**lang, **commons_config} - # instantiate detector with current configuration - my_detector = Detector(config) - # Detect features in the document content - entities_dict = 
my_detector.analyse(faro_doc.content) + faro_logger.debug(script_name, faro_execute.__name__, "Running plug-ins") + orchestrator = Orchestrator(conf) + entities_dict = orchestrator.run_plugins(str(faro_doc.content)) # Initialize our scoring class - scorer = SensitivityScorer(config) + scorer = SensitivityScorer(conf) + # score the document, given the extracted entities - result = _compute_scoring(scorer, entities_dict, faro_doc) + scoring = _compute_scoring(scorer, entities_dict, faro_doc) # output - _generate_entities_output(entities_dict, params, config) - _generate_scoring_output(result, params, config, faro_doc) + result = _generate_entities_output(entities_dict, params, conf) + + faro_logger.debug(script_name, faro_execute.__name__, str(entities_dict)) + faro_logger.debug(script_name, faro_execute.__name__, str(result)) + + _persist_entities_output(result, params) + _generate_scoring_output(scoring, params, conf, faro_doc) diff --git a/faro/io_parser.py b/faro/io_parser.py index 62c5580..7afd8fe 100755 --- a/faro/io_parser.py +++ b/faro/io_parser.py @@ -1,16 +1,25 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import logging -from tika import parser, tika import collections +import logging import os +import sys +from pathlib import Path + +from conf import config +from logger import logger +from tika import parser, tika -logger = logging.getLogger(__name__) # Tika-python will assume the server is running and will not try to download nor start a new tika server +from utils.utils import log_exception + tika.TikaClientOnly = True CHARS_PER_PAGE_PDF = 'pdf:charsPerPage' +script_name = Path(__file__).name +faro_logger = logger.Logger(logger_name=script_name, file_name=config.LOG_FILE_NAME, logging_level=config.LOG_LEVEL) + def flatten(iterable): for el in iterable: @@ -35,18 +44,19 @@ def _is_run_ocr(parsed, file_size, pdf_ocr_ratio): force_ocr = True else: filesize_chars_ratio = file_size / chars - logger.debug("PDF filesize_chars_ratio: 
{:.2f}".format(filesize_chars_ratio)) + message = "size: {}, chars: {}, ratio: {}".format( + file_size, + chars, + filesize_chars_ratio) + faro_logger.debug(script_name, _is_run_ocr.__name__, message) if filesize_chars_ratio > pdf_ocr_ratio: force_ocr = True - logger.debug('size: {}, chars: {}, ratio: {}'.format( - file_size, - chars, - filesize_chars_ratio)) return force_ocr def _run_force_ocr(parsed, file_path, request_options): - logger.info("performing OCR on PDF file: {}".format(file_path)) + message = "performing OCR on PDF file: {}".format(file_path) + faro_logger.info(script_name, _run_force_ocr.__name__, message) parsed['metadata']['ocr_parsing'] = True parsed_ocr_text = parser.from_file( file_path, @@ -74,15 +84,16 @@ def _smarter_strategy_ocr_pdf(parsed, disable_ocr, file_size, pdf_ocr_ratio, fil if parsed['metadata']['Content-Type'] == 'application/pdf': force_ocr = _is_run_ocr(parsed, file_size, pdf_ocr_ratio) - + message = "force_ocr {}".format(force_ocr) + faro_logger.debug(script_name, _smarter_strategy_ocr_pdf.__name__, message) if force_ocr: _run_force_ocr(parsed, file_path, request_options) except KeyError as e: - logger.debug("Did not find key {} in metadata".format(e)) + log_exception(faro_logger, script_name, _smarter_strategy_ocr_pdf.__name__, e, sys) raise e except Exception as e: - logger.error("Unexpected exception while treating PDF OCR strategy {}".format(e)) + log_exception(faro_logger, script_name, _smarter_strategy_ocr_pdf.__name__, e, sys) raise e @@ -96,8 +107,8 @@ def parse_file(file_path): """ # Retrieve envvars - timeout = int(os.getenv('FARO_REQUESTS_TIMEOUT', 60)) - pdf_ocr_ratio = int(os.getenv('FARO_PDF_OCR_RATIO', 150)) + timeout = int(os.getenv('FARO_REQUESTS_TIMEOUT', 300)) + pdf_ocr_ratio = int(os.getenv('FARO_PDF_OCR_RATIO', 500)) disable_ocr = os.getenv('FARO_DISABLE_OCR', False) # OCR is time consuming we will need to raise the request timeout to allow for processing @@ -116,7 +127,7 @@ def parse_file(file_path): if 
'X-TIKA:EXCEPTION:runtime' in parsed['metadata']: return parsed['content'], parsed['metadata'] except Exception as e: - logger.error("Unexpected exception during parsing {}".format(e)) + log_exception(faro_logger, script_name, _smarter_strategy_ocr_pdf.__name__, e, sys) raise e # try to implement a smarter strategy for OCRing PDFs diff --git a/faro/language/__init__.py b/faro/language/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/faro/language/language_detection.py b/faro/language/language_detection.py new file mode 100755 index 0000000..33415a6 --- /dev/null +++ b/faro/language/language_detection.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import logging +import sys +from conf import config +from langdetect import DetectorFactory +from langdetect import detect_langs +from langdetect.lang_detect_exception import LangDetectException +from logger import logger +from pathlib import Path + +from utils.utils import log_exception + +DetectorFactory.seed = 0 + +script_name = Path(__file__).name +faro_logger = logger.Logger(logger_name=script_name, file_name=config.LOG_FILE_NAME, logging_level=config.LOG_LEVEL) + + +def language_detection(file_lines): + lang = "unk" + try: + """ + El detect no funciona correctamente. 
+ Detecta 'ca' en vez de 'es' + """ + # lang = detect(" ".join(file_lines)) + # print("Detector: " + lang) + file_lines = " ".join(file_lines) + probabilities = detect_langs(file_lines) + # print(probabilities) + if probabilities: + lang = probabilities[0].lang + faro_logger.debug(script_name, + language_detection.__name__, + "lang: %s" % lang) + except LangDetectException as e: + log_exception(faro_logger, script_name, language_detection.__name__, e, sys) + return lang diff --git a/faro/ner.py b/faro/ner.py deleted file mode 100755 index 038d3a4..0000000 --- a/faro/ner.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import logging -from .utils import preprocess_text - - -logger = logging.getLogger(__name__) - - -class NER(object): - """ A class to extract entities using different NERs """ - - @staticmethod - def _spacy(doc, ent_list): - # using SpaCy - for ent in doc.ents: - if ent.label_.upper() in ["PER", "ORG"]: - ent_list.append((ent.text, ent.label_.upper(), ent.start_char, ent.end_char, ent.start)) - - def _spacy_extra_models(self, u_text, ent_list): - if self.nlp_extra is not None: - for nlp_e in self.nlp_extra: - doc = nlp_e(u_text) - - for ent in doc.ents: - ent_list.append((ent.text, ent.label_, ent.start_char, ent.end_char, ent.start)) - - - def get_model_entities(self, sentence): - """ Get enttities with a NER ML model (Spacy) - - Keyword arguments: - sentence -- a string with a sentence or paragraph - - """ - - u_text = preprocess_text(sentence) - - doc = self.nlp(u_text) - - # extracting entities with spacy - ent_list = [] - - # Detect entities: PER -> Persons and ORG -> Organizations - self._spacy(doc, ent_list) - - # extracting entities with crfs - self._spacy_extra_models(u_text, ent_list) - return ent_list - - def __init__(self, nlp, nlp_extra=None): - """ Initialization - - Keyword arguments: - nlp: spacy model - nlp_extra: additional spacy models (e.g. 
with custom entities) (default None) - """ - - self.nlp = nlp - self.nlp_extra = nlp_extra diff --git a/faro/ner_regex.py b/faro/ner_regex.py deleted file mode 100755 index 6a6275d..0000000 --- a/faro/ner_regex.py +++ /dev/null @@ -1,232 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import logging -import copy -import re -from .utils import clean_text, normalize_text - -logger = logging.getLogger(__name__) - -# Email -STRICT_REG_EMAIL_ADDRESS_V0 = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+" - -# Credit Card -STRICT_REG_CREDIT_CARD_V0 = ( - r"(?:(?P((?((?((?= self.ranking_dict[ - self.sensitivity_list[current_idx] - ][key]["min"]: - above_min += 1 - except KeyError: - logger.debug("could not find %s in scoring computation" % ( - key)) + for key in self.features: + if self.features[key]['description'] in summary_dict: + try: + if summary_dict[self.features[key]['description']] >= self.features[key]["sensitivity"][ + self.sensitivity_list[current_idx] + ]["min"]: + above_min += 1 + except KeyError: + message = "Could not find %s in scoring computation" % key + faro_logger.debug(script_name, self._check_index_surpass_min_specified.__name__, message) if (above_min > self.sensitivity_multiple_kpis and current_idx < len(self.sensitivity_list) - 1): @@ -36,23 +41,23 @@ def _get_ranking(self, summary_dict): reached_min = False current_idx = 0 - for key in summary_dict: - try: - while (self.ranking_dict[self.sensitivity_list[ - current_idx] - ][key]["max"] <= summary_dict[key]): - current_idx += 1 - reached_min = True - # check if we are already in the max level of sensitivity - if current_idx == len(self.sensitivity_list) - 1: - break - - if summary_dict[key] >= self.ranking_dict[ - self.sensitivity_list[current_idx]][key]["min"]: - reached_min = True - except KeyError: - logger.debug("could not find %s in scoring computation" % ( - key)) + for key in self.features: + if self.features[key]['description'] in summary_dict: + try: + while 
self.features[key]["sensitivity"][self.sensitivity_list[current_idx]]["max"] \ + <= summary_dict[self.features[key]['description']]: + current_idx += 1 + reached_min = True + # check if we are already in the max level of sensitivity + if current_idx == len(self.sensitivity_list) - 1: + break + + if summary_dict[self.features[key]['description']] >= \ + self.features[key]["sensitivity"][self.sensitivity_list[current_idx]]["min"]: + reached_min = True + except KeyError: + message = "could not find %s in scoring computation" % key + faro_logger.debug(script_name, self._get_ranking.__name__, message) if reached_min: self._check_index_surpass_min_specified(summary_dict, current_idx) @@ -95,14 +100,14 @@ def get_sensitivity_score(self, entity_dict): result_dict["score"] = self._get_ranking(result_dict) return result_dict - def __init__(self, config): + def __init__(self, conf): """ Initialization Keyword arguments: faro configuration """ - self.ranking_dict = config['sensitivity'] - self.sensitivity_list = config['sensitivity_list'] - self.sensitivity_multiple_kpis = config['sensitivity_multiple_kpis'] - self.features = config['features'] + + self.sensitivity_list = conf['sensitivity']['sensitivity_list'] + self.features = conf['entities'] + self.sensitivity_multiple_kpis = conf['sensitivity']['sensitivity_multiple_kpis'] diff --git a/faro/test/data/sensitive_data.docx b/faro/test/data/sensitive_data.docx deleted file mode 100755 index 5b840ec79418fa0ea0f2610e27f0b7dee1ec9249..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 15027 zcmeHu<9}t#*6s>Bw(WFmJ007$ZFa1V(Xl$Vopfy5w$ZV5vvKa;XPLD}o$Q;LlkA;E(?Qd;Jeyfx7q+n{Ijp;roOq`0tI%28TH%l)&MlAfURB>Khan6ptWE$q_x7-5oa z#nZ0~46*uJF}Jp1O0fdb6W8~|jwwRPrst*ypsGCpq9_xhRY<-@GT{ZjpfPRUx2=OD zt&`eO$kuCRa-2~!O^^!It3wGr_B@)+jQ32Ei*Z;9aKycc z!ouK7mQ#)Tbn1lf#4JxW*iTg8s$U#l>8YgU{9U!X=3xH6Q6Fav-oib_g9{+A3y+r z_jeG0+`ow=-gm6V^AA3g`H(p150TV$Ft&1}qy6pvzoPg*SpNR@>1DB97CrPZ{O19$ 
z0aI=A%Uu|`GIWNMD;P_kAT-6LkXIHh7T#XD7Z-uGkMu=Hrf1?NzdELiIB&*lUtlFF z!-lm%&Ohq(Xg;^P0TO~*^BLUc?Y3dk_U}wyhl#~0h6BRYQNt#%z`s9)B#(6>?@tjUuY0DWHw|d-Lt2O7Lvx>LD)Vf zAL}`xdY$f;Eq1!L!1c$*^q=x!64T8{2J^9HaRUH|AB*B@>tIA@WNYYb{jqQTwqc!W zjybG|Ahp}hwfCS@QiVSdLs$@#qKI?IaD*BRXx0x9(vu7T(GLQ$r17uGc$+P}7O!Yj z+lP3Eym#yvLM~5##u_$+-+tham7dxMwBsC2~p{R8XCt$bJQ^I0XLou8Ta*~zMx9iI5hJ&s2}24 z3oztb=-apDFdnvq$&ZRNe=>NJpdaY-OF%Ec{k$;^9pUwTM5 zPGXmX#T^z%!Ggs#0u*cD>=Hsz*hD4BTRZdd*pMrHsA@$K(k}(!jH$yhjWpf_VUmt) zKbZyUupuF_Eo%z4q!fwu+SJA5jAQ4^OdP*O+dRb`n<~FQRUKx#rufX|>&6A7xGRQZ zG{~9reqWLfBoCJ885DN-EyTTVrHP$FuSB}dq zNM!T%G^cxw0X>z3f9tICQ;Nw@(_Iv34Z>y_RV> zVo|LuWo^XK<&?D~1nAG9W;8d0rbE*aHJA2PUS-NDYZ7q-;vNo?f|W%$i(gb=-}hoh zqZRib15GyzNVq*1WfJy#FtC6|Vcd-RL2l4(@&hSiw4ii{_J)+MZb9ixKxXph3raF%!hIn(uPqGfLz(-f^!0wmqajpjE%y##L4x^rXdcwaaQqR>@IJ|e%qpWd zR#Wb%16y?xwNb~!qPop%pB)%j%;-lsNAAHXKhfd3es&|zH(vBP(u^M^2(r6YCKZAg zbrS9x5I5<=#bVO%-RiNmBx+!zaoVzZWVGa) z5;avH0{kcp;34K92m%bK7eJa%9RRZI0B4_ka1SAvp~N4+12fLcvA~UN-^!~FC7wc+ zj6JWy1-WFsZ&wnsbe^w=BVq8Cj3jDEM)X!N-3BJgFmY0sa=^}dGMR~p;70%?L}W7w z%nXCbQFux| z256d}O+QDm*983Slw!WHLv#g)TqL==&U!8udH*cpOE3H9oX;!^7yb+OoEaws@nrM@I+!XIRh9|`@~98#OperGD<0dWPQa7LvE#bFhsSu|w6xB%`_G^-! 
zf^;?+%wX*GkN%7hA$$Wm+$PC8h~I0CIF@iJNFp)9rj=O(q%uypd=-l!kJ4==Cm-Zc zQ~z@4$`hXgE{D(1OXs(8MP2&Zvmdr4m`5yBg^8uu2ZOMr+yFs31sHQzL3F4n@du)= zkWsJPMR}w@4}|aPpr@3eOeG;5mKqU?DgJ^wvfS@lT3^*mUC=*Y6#^O;B1Qy6z?XZK zB_>pwXlOp|K*VCtN_zB|X-}wsQS9a@Xd^<-jWEo(jVS5fwX;Lr|Fv9Zz}b3Y&8+%4 z|FTfYxvwg3_g;M8KON&L+YQ9lZ#*=6N7#BMQnX1??t4IgD-PQAy)Y&=S|OBJDGX~6 zlcBwVg#afLFmqT7MhLqx-RA`h1`!B5X7p=#tUB2W?(v5JQ5)Z!YTUZJVmrf<7f0XZ z3!q5z9U(bb)gD$vdWLN(5#{<2Y5MUdp?aV*VPN@Uac@wTiXKQtHQlN@lo`@VeC+Uw z^JkltM!eTL)koUVHzy4OG-*N!XBjbGPGi{LkEK24r+$6(_ zNL6X<5+Wfb3#ei@#2E@qa`^f@Av*Im6hx+=Z*N7|30_6}qwdlm?2*S^dQ!rO=}3ig z*<$wAXl7*$mN=GUQRnP3BaOV>R%u*7r00v-i41c*7NiLx z@d^mzvm6vXp(P|pMRm)?FXn!x%PMw{dkK8rD{aSLZQd(P5;x3YsS90IYXwS;oq1&+ zYY{YPN>090E;G;hNMLqRpEm+gRF&&hdC~kB<&)YIt?hrh;G!K$s<-oKn9Kj_z?b6V z$g|(byWt`&wd2g% zJCL3+gAlJ)N2ofP&D?AoB5D}x+;WAFTwkhs;bIn!^sk1{t|`NB68-aB26tzeHYHaqc+J zcoQM+doI$zUf^|*t;&_|d|WT=D2xNTgIkRkRkxyBzb-yX_jGA)Y3_RszCEl*aT?8d zhK~=N=AD;6f3dB!l{WR8alQMh$?FH}L_M>S%$pl+dJ{L8>%{&Ykecy>&5HPJ{IklW z1NCy5dPDrib!(O~kJh;cH=T){Ra@z0TQC0%Gqs_zt<7vSD*DgZmlo@p=5ru<^pNio zh)nPt$Hh&WY-~wo9jtET7j3`1H+Vc7JRL5VbPlQ89F|zqBqx5>G1kQq1!mDw3LDd9S-_ib1q z@(dW4Oe=OhU!7sCG<7d`scfkkYq1CUN;h^~Ksx58UUU#x_okPfqime9!o^p<9;%b* zekm12-WSKL@l(7i;RvJLGw?;=j6E3J*SPS>ERr@O;l>LmQvTrQ@{y|A`N1p)d_K3W zR)Ug~*4xt|MWjj0s`PavTeoNVoEV&P8HA6>P-lTpeSs}nc~qm>(bS@(HFP@7YJKN* zQaMeZ!2QwO1fLG?NezB;o34&i=Izow-}|IjI~GY==fUmQwmZ}|&IhRWiE|k)HCNNQ zJ5vAinPsGQP8)JFf9)k=6Q)YX>7%Omm&;1rs)cq3Y9BqHzID}8+_1Lx!JcVPwXL%# zu9lOVuj8a`D6e5Hl}%iruF>XHz0~Q#oIg4?puKMh59T&4z`Qedo>V64&yl?|UrH|D zv+5p|QMpA7C=|u<6VKvwS7mJN_B`syG+3P`Yw5h6?*@R4S`x}1_Rh(Fd8+HfX!Es3 zXUpg0f8XyMF-&Mr>4-ea{V7ngcb?C_#-4x!-CqiFZeg<3??l^l54UvD{)VML6ghwXN+iQ3{x z-7QT^t-=z5ud4rR>zHZFPpOxA<$4f?-170`lvQB)u9 zhC3(#fCGR4`rWDc!}I>TYw<5H9Oxq;_R;%)_SG8SCDTKXAapMF@@1FHULz~3#M}dr zhu6ga;9FiRUdHLNtDW`eQAQ+2DbY)~-DG$+daZV4P+uv1^DU#Uwm7*rG-Q%8$x^dm zo7AJJ2^EoH5htQHISfs_Eb+iJoSeB+Ka_G!4n>PPKgYmkUm>=n3Or0ZfHfUue4$hS zCf|zxEK7((IyU9T3%;8BDuUIW@`bDeMSh<30o_&=MPuuSVkL_PFCyWF{d2_VmII1a 
zSJGhdVW_3+X6{O-l7YldU9p?K5X z@P)X$dZt~H_EOv}+R(O$4{}m?%`RU>7>o^yUD-%M#@`gv1qo_AP4aI%$w@%dFyG|! zk;P0F@a2eNtuWQ>6^OGU3|HuvkGMwh{%~}1;&wa=c!vB!E+jTQ6Q$ZFcZncA;3+s$ zw6lf@D@g+{9y9}K!cB^6E1q}XwRgYK5=d8)OakiXNlCXIg-*rNp+-Bx0AuLxHz5w? zs7Q-cmPM6wCA6tHnO5acu3#RIETssa&kj^i93z12ZzEIaB-J`%c?$ymu0VR*nH)-7 zo?d1IwK!B((y^=XO)3j{loPLvd8?4vLDxM*zgRD4M|m_;wcWG0716*V+DgF2>`ldi z%5SL5WLLXSAk)~#LJ3arua5-NEs?n94>HQ0pwo}iQQRq-;qC1m?R9|EBNr`i-s@m{*rC59cL z$&H$v6MrrSux*qgt$ve2DGZhKLnU9kIcD^BI4F8vZFco4=Co%F4m77K?m3p(EmVxD zYzDX2!^uo=2hZziZ6UmmMtK{RD_Eht{I9-y-M7~V#hV4XjW)NN1Z=wShXX$Ex0idq zH|e$-Gq2a{Yh54jyD@a1w~Ilbb0|-(z-Ti;l4b#!DeH_*a9rpZ!%GXKT#T6`vV`#NH;nDA@yG5(jJ|6>gqLHE9202dbw^nDI+f=5Y z!kHcDXwigMts`W$EQ2+5@Tm6@?q~BEBSM9*(Lj=RLh^(Nc?L`a*?V~lW@}CeaiM7+ zm9nELj`fe3heGf)r{bZtvmqahkBA`N8mt`M&~Moti_hbbu^~-Cn35#AshPY((4oj> z`q}k`cY#!@!*{0?B4`36!7`#CcFgDr6`GA~!t#ahIrA_ZSKyZztG^3Qr9iv;a?lYy zd!qHTr*Lu1Fw+`T4r2y>7HMp9h~2z{obtK=k-2YvrX@{m(e-XP=f`sR6q2D1vGAQ> zl{Z?FwXZk7l?sWf&jrj`MAQ~hJ==>PqNmVJ5Te;}F6h8l#e$9H>i#rSaBKi!k=S4rBw66oOo^cYUx{ON|ws%_% zrKvIDkY4xUsk}OQjS#EEaEI>Fjyf|h=a#8n2am0>9F<1nRG5u}nngxe+kM=?8hnQg z8nJD%UIp>h#PvqI{HZmf-AkD;`jbu(llz1-+~4*sIH1uXt(L{?ZV{C)cgvls3+dGkDTqE|>uM2?*HE|7F-ha5j>5&9S<|*>fiu)< z4?J<1#|g@`0?#*F0{gz=M?f+^)_PTfxA-0);&ocg|C32&l3IFsVi8*)hq ze*_uAi7;Cg^MOh^%Pro9yu;aWiz&yovJ+#9%#hj{=Fo?=NMS$TPN?Sk04#>z7pLen zinh%P{snBTgmp7eqP}`RFigp*~- zz3QX3NUaXg;I&ZX>haI2S|ww?UqdwY!nP7EZzq^{&ObV9UECkIC%Af+E2mA4{YLp> zfyl1a5j)E0hpBCv^bAD07r|he|^`kQMdc6BXT6aKY7OW8uHr)rmtGS$0O z+v2HSW5L!%6Qtu=&)9PD@yWwlr|0B~?_;a`&v0&=lv{Jzht)L=`?qkeqp_2dxs9pg zZyRBq>Wa+{D}oo@q!0azR;6=eSw(G?r+H0COCP7OU;7l2H8P>FU*4%(_?t&?yxCG@ zvzWpzY4-kTIwo9j&*gxuY{VU$npk(Z_{QtzjYrd6?qo>*7mA}l6Sv@*va zJJok0Y{5`YSe}8Z|cyONFDV<>87YXL5z68 zz=|P$=nk?H=1Ys3OMq3)*q(b79;!~V( z8Hhi;d;l9`Xt1csEm7*)B6#glogN|x#t%o%IQBgZpd*rau^mz5oYyg|)iyT`dbAKE z-nhbMJ2B+*zr?1K((Pf_;v)HT+6BgdYN>y&plW5o2H9<9dDyA&ApdxS)|`JIMeueF7BP(~DIxrl(ndoo406uD0l z!dtq1%lx}fMIKS7nadj=(Sr^kM)l$a%wlg#X`57}#Gkn>j>!>8Qeqet(LcO}D0tbO 
z&D5x+4;Tlbs1G$B9r4BN$}?OZu5y9vv7F=$Bz~*p30^KSnnE?KcBSFl#;*mf>Mi5P zZtQf)mWSb=oTZ%C#hKt^`Lkx>?DmHo=H!fb-HRte1-VnJ=Yp$)j{Q5dz0jh5_)di0 zn|T6-k@y~tw#%WTTBgfRftDl}FY25pf;qzRoMYzN#Kde`ZaLOj8L)NKoiy0I ztY$}F1T=(@FWF>*Mh{d)LSB=;Q?5RPiqw+WIEC;-!7Y?G^XJXTw>~<5RpUrJFv^nx zW98K>8E{Fl>|crZp>e7U%uFFmZCmI<$ z62Mf@>S`v3yH(KC)$@!mq6~p{I*IFeTV}6l3ghwZy;rs;gq)Ff@eYd?_QU(KkduKG8g- zYVCFaM{=_$oN@R4{i<%U7D$x6F9-l*;00{pL5;~p+9-|~3CFZ`J0!0Q@+q{_L1;ft*ZG*$7j)#M0PqXTOY(wjI(95 z>-W@1S*eUDIJm^~^c6g-R&w>bD$nQxuB12<*-_1)Z&4-z*zUb*F4k~D6wr#-NJJl&IhP|P@;i7l~ z+o%ffbIG=7h7?romecMsqwQPk&ly4UIBXfwOs%y2F-Y0IdNHI77MIa|qOFd7?&p+i zqt~}g)$xs6>}pD;>-(*t#WCO%@+3M2-9;4kTe!rgvqkmaa#@9cN_h-@75Z)?^fo&@0RYn??%R%x|1?s7G;`B#2yspRa?cd z1kY33me%r$gV?|3EdjA(!v_**-$?#K|Lg7ruYWCLyr*fU))m9mqo&`nG+)KZ&?*r% zK$}&uWKgLq*S`a%uzIvg3s=0SmYkRrAuOI ztafj0JL}<^fv!9%()_Rckht~=X3Asr1$*%6ByH3_DYX04=oKo<{5O1abmy2MHcJK< zKx^V;j96>5t_oyELXG}S?H#XJYo$RnOlO^wfR3UZISS0$cBF?!UC$@iRJe>3B6bf_ z&Sj>YkuPvAv=B0ksN$qTB( zg-?*ZBb0r^IoFkf_RtO)R=Ovt>NoCbO@B)p=uuo_L;bFW`J{vP>nT_BxY>+z^}@It z+~xtPtJquy-sKGm=k(*L%Vx6(?oO+Q8+|=mF2G$WxMf!!|NLURa-X93rYznGqzN=O zauZ36HBchaJW&y^dIu4&YDXTQZY$oN+h5Wl>K>j3iID*(LIUzrZfsXq5wjh=0?@aO z>=|F!4(UQ5Z0|}LFk%g0W<90rh?^VP8wyzv5ta#@2#X93j5%&c`Vw~+H^j@OQELAq z83tIAxpYf{BN=A(u9=`|sSYGUI#M4!P;@b>0v(f~pFo#7iJ*YiefNY3=T}S*~=i#^D6C*paJ`9!QXBc|;B2TORL;->UVm z{A#heX+C{a<{k|ZVZTbTv- z#4guSs&MJ%yh#h$G-Zu7REE=74s5BAsl62+jqmZzbp&5~ru)k!2Kplf2G=7+o7bMG z6XxkBAEdj{DDl*awz)NAg7X$(r^?Ys)8dkoJ8XRRTc4tne(Lm^bkXwko;Q3gYa`5e z_B$CiZ=v(n9nl*Zes7^lsa;snd=kEVSDP#1ujCYbz(2}2F@ciG^|)y@yB=b!z$gfp zEIKJWqYv9~E{G>Zf*|VTKb4`pK*Y-nxW&j5f>|qt@MEJAeY8N+r_1Yej~CJN_7=q; z87PSf{PF$+K?wYNHx@hVb69vr%Dk*dC`7YdLD1zQn>O~&B6>ViB@Fxz1SG@X+YtO% zDTRLP`Z?GCf*@BAYPsmoR~kGP7&b~6(M*&P@E?#s3{-_bruOGRzt87Cr|68221idB zF@TBMAN&CkjDVu}V;28+q~9KgiwL*}i(>r$99;Xl_AO-EHSLx6t)%ci>h%p@d+x{6 zlxKWPDCcRk#@+D5@jG=ldOlw6{3m#L?Y1d+q5_4_uA$4<)nDUIY(n|HE*@W7pCL2J zyf%|)=OzT3CZvE~ifl=m3LIux_R_v*4RRf)9*v1r9djQKw~vY0VR0WzG><2Zq$!LZ 
zL4dmEr-)p%lebc7vUP7Exb7W_5a4W8jIKfif~+P)WBUz!mmasxjrNF|adY;eAaIf% z0N?7BJxooMD>df))_+KqnmMV;!L}Y2=%O5TFa|&sHkntloq9fD#7l|v$k3P$Jf06kA(*Kjn5Y@j8 zVIv(v@BmdLKJ0`z<)ypIGI9Wh{k9j#$28_hn(2l#g?FqvEB}kv_T?qPxCMWb=^5Q* z^pq`&4Np~~#MZam&t@Ra=$1*Db@7g*#0R0wd&Rlwt*OCc3aqcoMMul<%&fBc z(=c-TWa`+gU>cuw=Sl&OuJTE9O@{wS+lhd7v|9Z}znk-2Yn5Vq?!r70^$Aw~FiSg} zcgaj=177Nzl<{q@`F?a7HG}WL)2E)23^KYh!IZNDtzk`-vvRaIZPHBCx%pR~`+P9l z2HDc{jq85Pq@p(8wXty}x)Xex>j>A|$d_AO&v}uw&R?bT*$n16oUgbcHruC>9dy0{Xot!1d^RO$=*1KUh z+|D1a9$HN%RQ;-VM{yfhe!kTUUOSNGEq(pzWAAJ~#Tz!#akcUO>A!_UFvTxfRzA{8 zhaasd|MDALjSUq478N;7ShGo^M;SOLKf{eV$A$nImWz{6pEI9kme#xn^e5|$-3c@x zr}Dl=LFO4P38yfdqqlVp47|Ow#;)A1u(4@RU0!jy!r3kZD3;N}%EflHGy}7ZVO3{ov-YsyE$PEJS5vKv ziXMMjR$>$-a>_$dtP)1#ekKO#Zw&-?rord{X;9Q^(ZR=}{SL9rnp00)?g3d2qF4`( z(TH{}e>JG6bck${qU9T}2Tna}<2M9@c>6h?E_Yqw3NvOLq!-;c;&Q+}!c#A>DBVGb zB|7d0drqp|8*MUV=aqYiQ$!D5{DfHb_KZqEcD@kO-RDlmPdtl!3>qCbWIDP#s+yr~ zxbs<5p16tU+uSYVJIv>F@p8Doo_cr^+$03g`=aldD%TDpW!7yCR$jVZ0cZJ+Ame^d zW6^$Alsh!{WmD`{plo+T8`mO|nD|!UNOmHJ<6Ve^6HO7vW>U@f9rL!B%{QXq zBV*>=6|dMs;vTSR>YJF~w&-hwGT`@*Tzw2%ZEY$-dx?_Vb(m zJlYV+5BznK6fOF?``-HYQG@@V@iB>SZzbd(rNH_h%z*SSW-$4v(pSyG=rv?O=j#00V55(hV zYvCwUOiYQE3E4(b^1Uc?9IAydubii**eL3vN-}{e)L&?74{h!rXRK>!>Ks`8ABSy7 z$K;N*>LU#$CCNg6oP<$`p)HGTZ_cF~h>xB(r^?e%Z9ja6{u~DOS}biv>TMWqCJYDqbDhBQa-3yn7Baj#`vpsCY^R-3Bi0hzja+ za>nLl+%~Y9G)NwCXKGE7d4%*ti2goH@WBlG2~+TaYI+CNK9t@PbFW2FPs8-N*!0K9!_ikY2 z9Z7CmnkdhHbtL_a@JHa=v|0#5!dybuJ>_ResIGN5+Y`NZx>iXwFQB2zc=q4$k}wLR zpLvdnAeDj%Yg&Jv6Bp}JY3V)kaDQYd5kC^%A6afW2U|NwIzwBB-?{3e z*8G37+#m855xXJVLxAABA^n1nwa#TW*T<|q<}{Oh_`;HV3A>eUCJC87|9U^|a+Z#@ zgFWDN59FHn311kmHNd%KmKX^Z1Cl>nfwB$b`^uwzlPqwsmSjc}mWUB#6pqn~W-)!} zXBw^LX$MGr9Bi z5&q7^`gV4|X~zHM;g4nexY8B7Y*y%D+Ta)YFq+vU(rI9Uh5KR*4)rymd#xu@rO3OD z#AA}PmiW%7?hJF!`V|{~?9c2FOY`t3Z13mS_aPcEu9q)aF&cxhpIPp>Sl+35wQAH5 zWjoZ@taqjUAgIQEyo+=T;pG3Ka z_G!}pi;heoG#nj7#kVKQ2-u}bd*|MFYYmUJBT3mg3OPOsO0xiEh%3qa!XLAEnq8tQ zN_Yw7QqT^(QhD@jDn z(<2D{JTxyO&nUE`)w(k;+-6M`{8!XKx?5#%n!M+4Ho@;PTqApTSm4zu=~_#V5q!-y 
zp4jok;g2DM+qOpxN^cD-(W>ZlLsnN4;P)$8%+kiZx;h73G5NiTYDMNnJ8B|=t5c{s zXK-0QYjDSoV;iA1r5WGIsZ7I%;@iC=$8pZO9};)zKK|oBqyq?;=EG$C@2C5IXNAAr z{>xcEImv$~@b71l{)Pp_eTdxOPA2^m_|HQce?eic{ z|0vG-C#8SZ+WbY!`J?pazgFJ-lf*wWlz)-%$N7`Q-}9CKg#R-O@E2U0@L%x%n-lma z_@A!8U*H6qKf(WS4E{;spZ4Ql6mmY;=Jy5oH*4~r_37f0zybg#WvW i{yV&#`ET$)6}Frt*av(3cJ2)t(Dh;TuCo31^nU output/scan.$SUFFIX.csv +echo "filepath,score,money,signature,personal_email,mobile,financial_data,id_document,custom_word,meta:content-type,meta:encrypted,meta:author,meta:pages,meta:lang,meta:date,meta:filesize,meta:ocr" > output/scan.$SUFFIX.csv # Run faro over a recursive list of appropriate filetypes find "$INPUT_PATH" -type f \( ! -regex '.*/\.[^.].*' \) | parallel -P $CPU_PARALLEL_USAGE python faro_detection.py -i {} --output_entity_file output/scan.$SUFFIX.entity --dump >> output/scan.$SUFFIX.csv diff --git a/logger/__init__.py b/logger/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/logger/logger.py b/logger/logger.py new file mode 100755 index 0000000..81fcf42 --- /dev/null +++ b/logger/logger.py @@ -0,0 +1,36 @@ +import logging +import logging.handlers +from pathlib import Path + + +class Logger: + + def __init__(self, logger_name, file_name, logging_level=logging.DEBUG, max_bytes=1000000, backup_count=3): + self.separator = " -- " + formatter = logging.Formatter('%(asctime)s -- %(message)s') + self.my_logger = logging.getLogger(logger_name) + self.my_logger.setLevel(logging_level) + file_log = Path(__file__).parent.parent / "logs" / file_name + handler = logging.handlers.RotatingFileHandler(file_log, maxBytes=max_bytes, backupCount=backup_count) + handler.setFormatter(formatter) + self.my_logger.addHandler(handler) + + def debug(self, file_name, method_name, message): + error_msg = "DEBUG" + self.separator + file_name + self.separator + method_name + self.separator + message + self.my_logger.debug(error_msg) + + def info(self, file_name, method_name, message): + error_msg = "INFO" + self.separator + 
file_name + self.separator + method_name + self.separator + message + self.my_logger.info(error_msg) + + def error(self, file_name, method_name, message): + error_msg = "ERROR" + self.separator + file_name + self.separator + method_name + self.separator + message + self.my_logger.error(error_msg) + + def warning(self, file_name, method_name, message): + error_msg = "WARNING" + self.separator + file_name + self.separator + method_name + self.separator + message + self.my_logger.warning(error_msg) + + def critical(self, file_name, method_name, message): + error_msg = "CRITICAL" + self.separator + file_name + self.separator + method_name + self.separator + message + self.my_logger.critical(error_msg) diff --git a/plugins/__init__.py b/plugins/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/plugins/address_bitcoin/__init__.py b/plugins/address_bitcoin/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/plugins/address_bitcoin/entrypoint.py b/plugins/address_bitcoin/entrypoint.py new file mode 100755 index 0000000..dd182bc --- /dev/null +++ b/plugins/address_bitcoin/entrypoint.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +from utils.pattern.entrypoint_pattern_base import PluginPatternEntrypointBase + +_CWD = os.path.dirname(__file__) + +MANIFEST = { + "name": "Address Bitcoin", + "key": "FINANCIAL_DATA", + "version": "1", + "description": "Address Bitcoin Detection", + "author": "Enrique and Hugo", + "email": "enrique@telefonica.com", + "is_lang_dependent": False, + "info": "https://en.bitcoin.it/wiki/Invoice_address" +} + + +class PluginEntrypoint(PluginPatternEntrypointBase): + """ + Address Bitcoin Plugin entrypoint class. + + """ + + def __init__(self, text, lang='uk'): + """ + Initialize pattern plugin base plus additional parameters. 
+ + :param text: Incoming text + :param lang: Detected language from FARO Core + """ + super().__init__(_CWD, MANIFEST["is_lang_dependent"], MANIFEST["key"], lang, text) + + def output(self, unconsolidated_lax_dict=None, consolidated_lax_dict=None, strict_ent_dict=None, + validate_dict=None): + """ + Default output generation method. It can be overriden. + + :param unconsolidated_lax_dict: Detected entities by lax regex expression from pattern.lax_regexp() not validated by context. + :param consolidated_lax_dict: Detected entities by lax regex expression from pattern.lax_regexp() validated by context. + :param strict_ent_dict: Detected entities by strict regex expression from pattern.strict_regexp() method. + :param validate_dict: Validated detected entities regex expression both from lax and strict from pattern.validate() method. + + :return: Output dictionary with detected entities to be returned. + """ + return super().output(validate_dict=validate_dict) + + def run(self): + """ + Public plugin interface. It can be overriden. + + :return: Output dictionary with detected entities. (Default output is generated by output method). + """ + return super().run() diff --git a/plugins/address_bitcoin/pattern.py b/plugins/address_bitcoin/pattern.py new file mode 100755 index 0000000..35e5b23 --- /dev/null +++ b/plugins/address_bitcoin/pattern.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os + +from stdnum.bitcoin import is_valid + +from utils.pattern.pattern_base import PluginPatternBase + +_CWD = os.path.dirname(__file__) + + +class PluginPattern(PluginPatternBase): + """ + Main plugin pattern class. + + """ + + def strict_regexp(self): + """ + Strict regexp method. + + :return: Dictionary with n strict regexp expressions such as + + { + "NAME_V0": r"strict_regex expresion_0", + "NAME_V1": r"strict_regex_expresion_1", + ... 
+ "NAME_Vn": r"strict_regex_expresion_n" + } + + """ + return { + "STRICT_REG_BITCOIN_P2PKH_P2SH_V0": r"[13][a-km-zA-HJ-NP-Z0-9]{26,33}", + "STRICT_REG_BITCOIN_BECH32_V0": r"(bc1)[a-zA-HJ-NP-Z0-9]{25,39}" + + } + + def lax_regexp(self): + """ + Lax regexp method. + + :return: Dictionary with n lax regexp expressions such as + + { + "NAME_V0": r"lax_regex expresion_0", + "NAME_V1": r"lax_regex_expresion_1", + ... + "NAME_Vn": r"lax_regex_expresion_n" + } + + """ + pass + + def validate(self, ent): + """ + Validate detected entities. + + :param ent: Input entity + :return: Return whether (True) or not (False) entity is being validated. + """ + return is_valid(ent) + + def __init__(self, cwd=_CWD, lax_regexp=None, strict_regexp=None): + lax_regexp = self.lax_regexp() if lax_regexp is None else lax_regexp + strict_regexp = self.strict_regexp() if strict_regexp is None else strict_regexp + super().__init__(cwd, lax_regexp, strict_regexp) diff --git a/plugins/address_bitcoin/test/__init__.py b/plugins/address_bitcoin/test/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/plugins/address_bitcoin/test/data/document.txt b/plugins/address_bitcoin/test/data/document.txt new file mode 100755 index 0000000..9bad7dc --- /dev/null +++ b/plugins/address_bitcoin/test/data/document.txt @@ -0,0 +1,27 @@ + ok "bc1qar0srrr7xfkvy5l643lydnw9re59gtzzwf5mdq" + +n itself, who seeks after it and wants to have it, simply because it is pain..." +What is Lorem Ipsum? + +Lorem Ipsum is simply dummy text bad valid 3J98t1WpEZ33CNmQviecrnyiWrnqRhWNLy of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. 
It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum. +Why do we use it? + +It is a long established fact that a reader will be distracted by the readable content of a bad valid bc1qar0srrr7xfkvy5l343lydnw9re59gtzzwf5mdq page when looking at its layout. The point of using Lorem Ipsum is that it has a more-or-less normal distribution of letters, as opposed to using 'Content here, content here', making it look like readable English. Many desktop publishing packages and web page editors now use Lorem Ipsum as their default model text, and a search for 'lorem ipsum' will uncover many web sites still in their infancy. Various versions have evolved over the years, sometimes by accident, sometimes on purpose (injected humour and the like). + +Where does it come from? + +Contrary to popular belief, Lorem Ipsum ok 1BvBMSEYstWetqTFn5Au4m4GFg7xJaNVN2 is not simply random text. It has roots in a piece of classical Latin literature from 45 BC, making it over 2000 years old. Richard McClintock, a Latin professor at Hampden-Sydney College in Virginia, looked up one of the more obscure Latin words, consectetur, from a Lorem Ipsum passage, and going through the cites of the word in classical literature, discovered the undoubtable source. Lorem Ipsum comes from sections 1.10.32 and 1.10.33 of "de Finibus Bonorum et Malorum" (The Extremes of Good and Evil) by Cicero, written in 45 BC. This book is a treatise on the theory of ethics, very popular during the Renaissance. The first line of Lorem Ipsum, "Lorem ipsum dolor sit amet..", comes from a line in section 1.10.32. + +The standard chunk of Lorem Ipsum used since the 1500s is reproduced below for those interested. 
Sections 1.10.32 and 1.10.33 from "de Finibus Bonorum et Malorum" by Cicero are also reproduced in their exact original form, accompanied by English versions from the 1914 translation by H. Rackham. +Where can I get some? + +There bad regex bc2qar0srrr7xfkvy5l643lydnw9re59gtzzwf5mdq are many variations of passages of Lorem Ipsum ok 3J98t1WpEZ73CNmQviecrny +iWrnqR +hWNLy available, but the majority have suffered alteration in some form, by injected humour, or randomised words which don't look even slightly believable. If you are going to use a passage of Lorem Ipsum, you need to be sure there isn't anything embarrassing hidden in the middle of text. All the Lorem Ipsum generators on the Internet tend to repeat predefined chunks as necessary, making this the first true generator on the Internet. It uses a dictionary of over 200 Latin words, combined with a handful of model sentence structures, to generate Lorem Ipsum which looks reasonable. The generated Lorem Ipsum is therefore always free from repetition, injected humour, or non-characteristic words etc. 
+ + +bad valid 1BvBMSEYstWetqTFn0Au4m4GFg7xJaNVN2 + +bad regex 0J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy + + diff --git a/plugins/address_bitcoin/test/test_address_bitcoin.py b/plugins/address_bitcoin/test/test_address_bitcoin.py new file mode 100755 index 0000000..3faaeb0 --- /dev/null +++ b/plugins/address_bitcoin/test/test_address_bitcoin.py @@ -0,0 +1,39 @@ +import unittest +from pathlib import Path + +from plugins.address_bitcoin.entrypoint import PluginEntrypoint, MANIFEST + +CWD = Path(__file__).parent +INPUT_PATH = CWD / "data" +FILE_NAME = "document.txt" +GROUND_TRUTH_RESULT = ["1BvBMSEYstWetqTFn5Au4m4GFg7xJaNVN2", "3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy", + "bc1qar0srrr7xfkvy5l643lydnw9re59gtzzwf5mdq"] + + +def load_file(file_path): + with open(INPUT_PATH / file_path, "r", encoding='utf8') as f_stream: + return [f_stream.read().replace('\n', '')] + + +class AddressBitcoinTest(unittest.TestCase): + + def setUp(self): + """ Setting up for the test """ + pass + + def tearDown(self): + """ Cleaning up after the test """ + pass + + def test_for_address_bitcoin(self): + text = load_file(FILE_NAME) + address_bitcoin_plugin = PluginEntrypoint(text=text) + plugin_data = address_bitcoin_plugin.run() + results = list(plugin_data[MANIFEST['key']]) + self.assertTrue(len(results) == len(GROUND_TRUTH_RESULT)) + diff_list = (set(results) ^ set(GROUND_TRUTH_RESULT)) + self.assertTrue(len(diff_list) == 0) + + +if __name__ == '__main__': + unittest.main() diff --git a/plugins/corporate_email/__init__.py b/plugins/corporate_email/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/plugins/corporate_email/entrypoint.py b/plugins/corporate_email/entrypoint.py new file mode 100755 index 0000000..c28201c --- /dev/null +++ b/plugins/corporate_email/entrypoint.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from pathlib import Path + +from utils.base_detector import get_unique_ents +from utils.email import EmailFilter +from 
utils.pattern.entrypoint_pattern_base import PluginPatternEntrypointBase +from utils.pattern.pattern_detector import PatternDetector + +_CWD = Path(__file__).parent +CONFIG_PATH = _CWD / "excl_corp_email_es.txt" + +MANIFEST = { + "name": "Corporate Email", + "key": "CORP_EMAIL", + "version": "0.1", + "type": "Corporate Email", + "description": "Corporate Email", + "author": "Hugo", + "email": "", +} + + +def config_email(): + # Email filter known corporative (non sensitive) email accounts + with open('%s' % CONFIG_PATH, "r", encoding='utf8') as f_in: + excl_corp_list = [line.rstrip("\n") for line in f_in] + return excl_corp_list + + +class PluginEntrypoint(PluginPatternEntrypointBase): + + def __init__(self, text, lang): + self.is_lang_dependent = False + super().__init__(_CWD, self.is_lang_dependent, MANIFEST["key"], lang, text) + excl_corp_list = config_email() + self.corp_email_class = EmailFilter(excl_corp_list) + self.emails_entities = list() + + def filter_emails(self, total_ent_strict_list): + for ent in total_ent_strict_list: + if self.corp_email_class.is_corp_email(ent[0]): + self.emails_entities.append(ent) + + def run_pattern_detector(self): + pattern_detector = PatternDetector(self.text, self.lang, self.get_pattern()) + + total_ent_strict_list, [], [], [] = pattern_detector.get_kpis( + self.text) + + self.filter_emails(total_ent_strict_list) + + return get_unique_ents(self.emails_entities) + + def output(self, unconsolidated_lax_dict=None, consolidated_lax_dict=None, strict_ent_dict=None, + validate_dict=None): + return super().output(strict_ent_dict=strict_ent_dict) + + def run(self): + unique_strict_ent_dict = self.run_pattern_detector() + return self.output(strict_ent_dict=unique_strict_ent_dict) diff --git a/config/excl_corp_email_es.txt b/plugins/corporate_email/excl_corp_email_es.txt similarity index 100% rename from config/excl_corp_email_es.txt rename to plugins/corporate_email/excl_corp_email_es.txt diff --git 
a/plugins/corporate_email/pattern.py b/plugins/corporate_email/pattern.py new file mode 100755 index 0000000..129f528 --- /dev/null +++ b/plugins/corporate_email/pattern.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +**************************************************************** +**************************************************************** +******** ******** +******** La expresión regular incluye €/EUR/euros ******** +******** ******** +**************************************************************** +**************************************************************** +""" + +import os +from utils.pattern.pattern_base import PluginPatternBase + +_CWD = os.path.dirname(__file__) + + +class PluginPattern(PluginPatternBase): + + def strict_regexp(self): + return { + "STRICT_REG_EMAIL_ADDRESS_V0": r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+" + } + + def lax_regexp(self): + pass + + def validate(self, ent): + pass + + def __init__(self, cwd=_CWD, lax_regexp=None, strict_regexp=None): + lax_regexp = self.lax_regexp() if lax_regexp is None else lax_regexp + strict_regexp = self.strict_regexp() if strict_regexp is None else strict_regexp + super().__init__(cwd, lax_regexp, strict_regexp) diff --git a/plugins/credit_card/__init__.py b/plugins/credit_card/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/plugins/credit_card/context-left.txt b/plugins/credit_card/context-left.txt new file mode 100755 index 0000000..e69de29 diff --git a/plugins/credit_card/context-right.txt b/plugins/credit_card/context-right.txt new file mode 100755 index 0000000..e69de29 diff --git a/config/keywords_creditcard_es.txt b/plugins/credit_card/context.txt similarity index 51% rename from config/keywords_creditcard_es.txt rename to plugins/credit_card/context.txt index 5a29909..8bc21a2 100755 --- a/config/keywords_creditcard_es.txt +++ b/plugins/credit_card/context.txt @@ -1,7 +1,4 @@ -tarjeta -credito -debito -visa -mastercard -cvc -cvv +visa 
+mastercard +cvc +cvv diff --git a/plugins/credit_card/context.yaml b/plugins/credit_card/context.yaml new file mode 100755 index 0000000..6e5db40 --- /dev/null +++ b/plugins/credit_card/context.yaml @@ -0,0 +1,5 @@ +regexp_config: + Context: + word_file: context.txt + left_span_len: 20 + right_span_len: 0 \ No newline at end of file diff --git a/plugins/credit_card/entrypoint.py b/plugins/credit_card/entrypoint.py new file mode 100755 index 0000000..32b0a58 --- /dev/null +++ b/plugins/credit_card/entrypoint.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +from utils.pattern.entrypoint_pattern_base import PluginPatternEntrypointBase + +_CWD = os.path.dirname(__file__) + +MANIFEST = { + "name": "Credit card", + "key": "FINANCIAL_DATA", + "version": "0.1", + "type": "Financial", + "description": "Credit card", + "author": "Enrique", + "email": "enrique@telefonica.com", +} + + +class PluginEntrypoint(PluginPatternEntrypointBase): + + def __init__(self, text, lang): + self.is_lang_dependent = True + super().__init__(_CWD, self.is_lang_dependent, MANIFEST["key"], lang, text) + + def output(self, unconsolidated_lax_dict=None, consolidated_lax_dict=None, strict_ent_dict=None, + validate_dict=None): + return super().output(validate_dict=validate_dict) + + def run(self): + return super().run() diff --git a/plugins/credit_card/es/__init__.py b/plugins/credit_card/es/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/plugins/credit_card/es/context-left.txt b/plugins/credit_card/es/context-left.txt new file mode 100755 index 0000000..e69de29 diff --git a/plugins/credit_card/es/context-right.txt b/plugins/credit_card/es/context-right.txt new file mode 100755 index 0000000..e69de29 diff --git a/plugins/credit_card/es/context.txt b/plugins/credit_card/es/context.txt new file mode 100755 index 0000000..8d07409 --- /dev/null +++ b/plugins/credit_card/es/context.txt @@ -0,0 +1,16 @@ +crédito +credito +credit +débito +debito +tarj. 
+tarj +tarjeta +tarjeta de débito +tarjeta de debito +tarjeta de crédito +tarjeta de credito +visa +mastercard +cvc +cvv diff --git a/plugins/credit_card/es/context.yaml b/plugins/credit_card/es/context.yaml new file mode 100755 index 0000000..9f7b854 --- /dev/null +++ b/plugins/credit_card/es/context.yaml @@ -0,0 +1,11 @@ +regexp_config: + Context: + word_file: context.txt + left_span_len: 20 + right_span_len: 0 + Context-left: + word_file: context.txt + span_len: 20 + Context-right: + word_file: context.txt + span_len: 0 \ No newline at end of file diff --git a/plugins/credit_card/es/pattern.py b/plugins/credit_card/es/pattern.py new file mode 100755 index 0000000..62e9169 --- /dev/null +++ b/plugins/credit_card/es/pattern.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os +from ..pattern import PluginPattern as ParentPluginPattern + +_CWD = os.path.dirname(__file__) + + +class PluginPattern(ParentPluginPattern): + + def __init__(self): + super().__init__(cwd=_CWD, lax_regexp=self.lax_regexp(), strict_regexp=self.strict_regexp()) diff --git a/plugins/credit_card/pattern.py b/plugins/credit_card/pattern.py new file mode 100755 index 0000000..2f6f47f --- /dev/null +++ b/plugins/credit_card/pattern.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import os + +from stdnum.luhn import is_valid + +from utils.pattern.pattern_base import PluginPatternBase + +# Own modules for plugin + +_CWD = os.path.dirname(__file__) + + +class PluginPattern(PluginPatternBase): + + def strict_regexp(self): + return { + "STRICT_REG_CREDIT_CARD_V0": r"(?:(?P((?((?((? 
person_signed_idx and int( + _ent[2]) - person_signed_idx < min_itx_signed: + min_itx_signed = (int(_ent[2]) - person_signed_idx) + id_min_itx = i + + if id_min_itx != -1: + _ent = self.feature_ent_list[id_min_itx] + self.signatures.append((_ent[0], MANIFEST["key"], _ent[2], _ent[3])) + + def update_signatures(self, total_ent_strict_list): + for signature in total_ent_strict_list: + self._entity_signed_person(signature[3]) + + def run_feature_detector(self): + feature_detector = FeatureDetector(self.text, self.lang, self) + self.feature_ent_list = feature_detector.get_kpis(self.text) + + def run_pattern_detector(self): + pattern_detector = PatternDetector(self.text, self.lang, self.get_pattern()) + total_ent_strict_list, [], [], [] = pattern_detector.get_kpis( + self.text) + self.update_signatures(total_ent_strict_list) + return get_unique_ents(self.signatures) + + def output(self, unconsolidated_lax_dict=None, consolidated_lax_dict=None, strict_ent_dict=None, + validate_dict=None): + return super().output(strict_ent_dict=strict_ent_dict) + + def run(self): + self.run_feature_detector() + unique_strict_ent_dict = self.run_pattern_detector() + return self.output(strict_ent_dict=unique_strict_ent_dict) diff --git a/plugins/signature/pattern.py b/plugins/signature/pattern.py new file mode 100755 index 0000000..fb73ccc --- /dev/null +++ b/plugins/signature/pattern.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +**************************************************************** +**************************************************************** +******** ******** +******** La expresión regular incluye Signature ******** +******** ******** +**************************************************************** +**************************************************************** +""" + +import os +from utils.pattern.pattern_base import PluginPatternBase + +_CWD = os.path.dirname(__file__) + + +class PluginPattern(PluginPatternBase): + + def 
strict_regexp(self): + return { + "STRICT_REG_FIRMA_V0": r"Firmado por|Firmado|Fdo\.|Signed by|Firma\s|firma del representante" + } + + def lax_regexp(self): + pass + + def validate(self, ent): + pass + + def __init__(self, cwd=_CWD, lax_regexp=None, strict_regexp=None): + lax_regexp = self.lax_regexp() if lax_regexp is None else lax_regexp + strict_regexp = self.strict_regexp() if strict_regexp is None else strict_regexp + super().__init__(cwd, lax_regexp, strict_regexp) diff --git a/requirements.txt b/requirements.txt index 826983d..4d39b5c 100755 --- a/requirements.txt +++ b/requirements.txt @@ -6,7 +6,7 @@ PyYAML==5.1.1 python-stdnum==1.11 langdetect==1.0.7 fuzzywuzzy==0.17.0 -python-Levenshtein==0.12.0 +python-Levenshtein==0.12.2 spacy==2.3.0 https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.3.0/es_core_news_sm-2.3.0.tar.gz https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-2.3.0/xx_ent_wiki_sm-2.3.0.tar.gz diff --git a/test/__init__.py b/test/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/faro/test/data/lorem.rtf b/test/data/lorem.rtf similarity index 100% rename from faro/test/data/lorem.rtf rename to test/data/lorem.rtf diff --git a/faro/test/data/no_metadata.pdf b/test/data/no_metadata.pdf similarity index 100% rename from faro/test/data/no_metadata.pdf rename to test/data/no_metadata.pdf diff --git a/faro/test/data/ocr.pdf b/test/data/ocr.pdf similarity index 100% rename from faro/test/data/ocr.pdf rename to test/data/ocr.pdf diff --git a/test/data/organizations.txt b/test/data/organizations.txt new file mode 100755 index 0000000..6cc9e1c --- /dev/null +++ b/test/data/organizations.txt @@ -0,0 +1,6 @@ +Hayek, F.A. The Collected Works of F.A. Hayek. Vol. 1, Fatal +Conceit: The Errors of Socialism. Edited by W.W. Bart- +ley. Chicago: University of Chicago Press, 1989. +Hegel, Georg W.F + +Apple Inc. 
diff --git a/faro/test/data/person_position.pdf b/test/data/person_position.pdf similarity index 100% rename from faro/test/data/person_position.pdf rename to test/data/person_position.pdf diff --git a/faro/test/data/protected.pdf b/test/data/protected.pdf similarity index 100% rename from faro/test/data/protected.pdf rename to test/data/protected.pdf diff --git a/test/data/sensitive_data.docx b/test/data/sensitive_data.docx new file mode 100755 index 0000000000000000000000000000000000000000..2b91fe5c3d9ed10133b5bccdce5534676e3186fb GIT binary patch literal 17101 zcmeHv19xTHwsmZ$V%ttCNyWBNv2EM7ZQHIG72B@Zwv#XC-gDnMx9)Ahkl z&DqDAd+k0k#^`g)N&tg=0RRJl0000W01$5nvb6*R03Zhg06+$S08$sQwsJ7Ea?n-b!Bhb2u;B5X0W5);vHjOEFls=m%o(TH`sif| zx5>DWUP|4Jz>G8ey5YxXGv-y-5Xe%?v1O!b9vj>}BymKhgC6-po7^v=)w(?$G1+>6 zuTJzWJ2R>v;S|ca7MyN={1q^ylj_eJ@gVfEB+-ugiHS^G9nfcoe4bF|l?V z$jK{%SY|q}Cj3h5I8%qeOj!7+M2iSF%gIZ_tu_a24uOPJgxro&6~ zM>O2b7+4m5S;2jju@hXw2>FnadvO{BCWL$jFv;LR{7UbA5xJf2N+!C6trqP08oAJl z3{-I(y0!Tl&Lx|df_115AL&R2GDG6`%-)`&Z*5N-_pP|+Ni><}Rrj>-sBDe`_LLxp zyX)v{Yz18wd|7!3FV67(3kT2pXE*!!00xl#w_b`Hi`jhfNj>S$Qs~cKs%>XzVNXZ< z$NB$y=l|j^`M0lL5!dtCJ7IV){NMbiJLFb+(DS9~^ru$Qm%$;_#Uznem&_L5-#C|+ zfHaSEMMh_4+`d0l@=dTyqI!Q8Ri$vMAhBF z%eaI(gK%0WFub6QcxscGXkwy!f7NOJR6LGvVT?4RxTD6jpr<3rh}Elr!n|ysK6cCR zD((*rq=w9e45fSVFw#I$dp`)<$Kd&TL7>{8y=9G+p~-jiIamE>|DZ}+spEqM000&R z06_eFD9+Y)26P72`i_>LQ`sML*r~?2-6|VWr}ca%u=t`Q^?Uej6c?3*-GGAF05X|E zqew^qi2<(u2&n+gpwA? 
zRBDKjn5St+R6N!op;#Q2bXQITig9=aT=T^8@@;x$Qw2@Us}z>%MCr6BL-l>iSxO^U zxHHER?CEy7@bC>QQkv;oOv-%TY7F>N+ND)>jQa)tSED(YWKebv2tV9!_jTmX=5l<< zjcQ1{XT%TeeUaOdx`*XBg# zqsyM}1{;EY6?`ZqT4iEOPNt_p)&#~q#Tc_9W}Q>Q;=&d43!i*r5bwY3I2rpzuW>Fe zvp$m048$te)#Ob{Ild^-PzlTFM`&eck=N^yt{^4PyJgwOk!m#1avua~K-sN|gz(d% zFMuZzae*=I$}%7iyHnRha!*lPLM{~pkbPA;fM=cvub7Ez1ni~fzEs)J zi7N7f?CkX`q~?x;4zw!EtFZv%hD_=x!8kr^;nnmLFQwEA5Hk=xayJnJZgy6r1mngf@x(>)2+fbAp`IBq00!k?G-mD?Mzlyy!bAKiIYt6X=~vo=n-S36QLlOl@Ptj|!_syDmA`{PVEgOU|d z*AW9#m6G)tM~LPboOFYNHLR38_(!{*(j^G-55&5HQyB%~s1Db7Eq;CxP%DdHv zNk6Xl;cMH_{I_O7)+0IXf}ZxsUxZqi|SF29-Hu* ztf}pCjiIEXrQMSGHkXnNU7Zg3aBaAFOafy&eCJq5Rprgb+t_6!e4E&ayHY#lveMmC z9^z8=6OTZsBDQKSzEA@xY>uc98|yf#UZ&t8(AD|loAI5hcbx~o=4MAWAMHvEB$4J|3P#Qqwpm-#6EzoPz;D@+zXc@b&(Q!;tR25}{hV-jF8gYF@qVP(B33cSAV`SOH zVu~Yxdj(Z;xi4Ti-r)V@IXr-@2P(0AcLDtxcs|y8LJ72UyfVK8f`CA2b0`{|yFeP_ zhDAyO@0h?c18j)(uH_hk31+9HIKUe(j;grbikKxXrdc@3Y_s~TU!+V+C2Kx|_bdRTT@fh0s~%+*E;!Z^cvRQS+d>uI}MqTt!RHNB2< zO^~NOi+X-2*#*WBb!BQdMK6fM(-hC1(d_dncub8C()VN7Un4hc z5Xc_zekF%F}7Q*F)Z&~tJ(njXeKGhn)?rHQ&1$(+TTFsuVOb?9 z%bsE$wI}12dsPV>aN?1@8xb*%9+4<*wGPz|!+29mI# z12J~gY_phRmF+p6HCYIf%8~{ZG20TQ=maN@^a4hYffK*#@PG$^}-0B%vz@l!26;N*_M2M3@iPIzH zD&7FH!dQ;5%s3A%O22;r1^16$Y6Es)QuIU9(;{Uih(lF8HD;K(${%vHw9X=YhWXVL z^id(-d6ofoC;wj1u_8AaLlyiAE!^5g&(15tgfYwQ4(??LU~-v|g0yXE3n~L;Fpx`6 z_tL+sF9#*?MqpW`1hxR)4DFwXIYsj(pF#FT-bMq%}ZxJ7Wn&jpac)*m9#USZt#ei zk~_@u$ONATDM@Zdb$)PSImVtIa3DOYVna^FL_v^5W#|l7`#T2^M)^@FG?&9_-MlXi zYxPYBP6sUh!)+V&858YBTzEwd@Jw=PP?bd!juJ{M%<95OgK+f}ZE&KwmVq62963k3 zvmqPooz$#deP-QfOUmxHWLr0kANy{^mt5OVzE5l42Lp)YPpDN5qbfim1k`*ts}GcQ zd+ z^iEUbfSdB4Sff&}I?6l%gAm}W>QnoB6HJ(0tL-`c0Ln8fGCYRL8T{b7B{HN%AXn(@ zcko)WJP?#DH4uHY^uY+?zpt+A=XKYzeQ*(f1WX4{4QC@IqfT|(I1zJqBdH8;TDhv) z0Gh*Q)WK6^)#nHl9&LS7S@qQyY9pybLG20#W`RfWe@OMyv2dBfzD<3*o5aY$B|PnW z-sq$-&fbxFaN#_J_R*8zL6OurJ1Hk`P)B_~KPz-RDC|7nkux8wUra5nlxpeWE31B( zn)`b}u>c3pxIyFSfFgDe-L5*7FB}VPJ1t#^Z|e7Dtq?rN^SoJr`nUzLkHx?d}IiJNCCXw#-6k^)-=FPz;KP%0_Se{s9kJ`%es?DtD>qoW=$s=9>8 
zZX4mHYw?};*|mVN0(4?u?C%?fYCH6NJ;v^E!+Njy33zimA8)*r#`nE(qw^KOc^yqW za#sa15S>{(=ve#bZR-BC_6O=**Itv@({B6@|B=1j_fC?mIH#& zZr8{21K5&zCcE-fJzniYNQY+aqM@rsT8ERh{o1l+@vN=1ZT2amb9BopsI!bMhBd0E zWE3f0vmc%p!D)qa69CVoEpYkH$^&HAByQ*nSkc#NMLtj%>cU+YO4IbK{>X0~_1!mU zFgQ-9v)!F<3v?@1zQS4S^4Y5vtU*=UzXqm&F0bQ;15A@NrT{tSBi^mt;Nw|SO`t^G z%68j^-F=N~-Q;h0#GP8|;@PQZX{VaPSFi%Xl1(5{ZH^6%r%Vt557>L zQTp3WR$b$KgR@dk>cMl!lRY}qS6Q0m#9^F5m6sW11Z$HYG01y%l0|jYOWOv>RIg^g z!coX5c`O~c_>u15*b|Q45CBjj_1$q=LwUo^`q%pf7ujbh;=lM?T{Utj!k?a)FDL*2 zHUI?RKm4tK1||OPb^R+q0r(j{{H*=I`)E&?lm=oz7`Pzq>??9^_W6K>#-0oCN9zYC zUteAiI(L|jg0j8<0j3ValERF>VYwNn+i1X%Qf`#`)eXKwkNnm^hQxbj=Wj@=wGVI1 zN(fm%|9yeLKo6Era-f(jpR6XVg+j3i60%KaNKj}FrVyspPc3kc#m7UQO`B?aCb>?| ztyDb5U+Y5rxhWVO&xN#}6WJ^2=Jl!7cf+MEj~??p!>bbTXb3`;t>A~6ijKmbM2_TX zLBe_fC!8%V@-`0YB=1kiU@l>aQ=lJFkn*+h&0xvy5o#C`R-yL(7bJvXq%&;2Qm1e&!m*U5QSs&%QlUC&?$$o4T}NT#bzEnl!#A7uC6`r0z;x#xUeg*>z1xtK^ z zX-D$IUUelA1EEFOD7eC*3|mrzEUHtkB2Ns=oNhMxp3<_{>bo*kC-ijhTvuNw%;lc? z2#oOzWbZZZC5QU*=!jtFDZwyoU9ezDq>T4kZOP@bLt4?-_%Qp_fT!p))@-sy^v43) zI`iWM!*OE57XLyS%+=KK_jNq2XCMQRsuCE(!Cj+mYFC!6e(h@)@rJYdVTUM_SrF3zqF3ho&bJ869{ zc(|TIvl}!ahh<~a--p6a!g`n85a*#g$scBrbWaL10~iG?&h9eti)qid8Z6sUCoC1w z0XL6vo37KQ9~I$C-T3b;UU{&fsatZ}OTz9~4_Y3Ld$%WF+Z^kn-lJ}pgdfxxIM#7?K02uTvIIzx%Y z9==OM47-;%CWq?Hv7n%VWC2@viRy>L>KcgcqgN}s!u<)yCSVZWSQs3Is0EXnZ%uyg zpNEKlNe}f>a@1b}YNve1TQ7%SQznuYO_XVC#Sr!7PHb^Ube5X}SLC=5`GTxPQL%ed z%`A$>;}~{Zr3=LjNGFiS-H^RrL+28DqRyRUlTPYl^V+ zE?)UUQijs=&FqU?$VVb^V!@{3(u{q9DO_z#uSKev3>%(=e~%^bvghv4sDttL4k&Qk zWbee{1}H;tY&|a=@2u}h-jXo>fE?Qhm3kqN-qp)0y#&y0fg*wX#4TIhKr;Y&xH^1? z-F_SN+iAX{e{dQ04=%5u4C-p?^nG%<6JMgDj%bJOtjVy;B&~;V=cpYT+Z(huS(9I zSNEY@ln9t*H3Q0Kc&!>pQoVLrKzr1-`Z80lZz;bv*86R!>^d$FbVh$Y9~9Umj$|c! zA#ienn8q||QfTfFHowJYSD9muU+kDRZpxf#3F`t$_Y$`^eG#|b1Yx(=jW#+PwBKqU z2u$QwThCF}V!+HUBhZv0&u)A0_y7PX{~DSdObjgz>HcwM_(xb)RXrSw^@|tXEl-H6vq$!SN8hXqn84zX<1uCa@Z97DJdzfP<553z_96vs&C8H~WC5-TiFs$! 
zu?lN2vW!C3j`P@OWC=Pcx5w{bcwM@JW)9go*pC1(@Iz(#OX*&QHBjCU;OV#xhx)o{uN9 z!ChQ$r}f3~-fEQ{RL-Eqa&qVW58CfKmw~1JE;g1JAUhl6DJnvE+btax~ zH#geeUiaf@-tU(~fEQ368UfKJ{KTz%($kh%-C#J-G5S|#Ncrfqd4jBw#L~4%wmIOG zO;fB%DZ1cQA{HFKP-18`y5Z3tsCz|fRmhJ4w4#xrU540CY`4~C9Xph#p~9K%=x9-e z)+{4rG|Yq5wQ#BT5gz6W86!f4Zcu@fcR~sT3AhH01K56Y7tPh35a2-5J}Kr#Qyl9a zGY^O0s!u0CYvw{e8Xgfsyf<0cyP(~%*_U3#BV$1t12ZLy_fj)?g`h!^Ne{5;3he?a z*M{#-%SX`oOMqrYLF|~&6Uet3ScMe|J$x&`XkLY1W~`0npH78#^=y5PmM0}shkg;Bts^1CJQCP$pQ8m|-7oxA&g&(5Tem?NPesrJx$eh6-k?N;my7^L+(z;A2@z57m zxIuZ-z*EG~M_lgU(aN9oKTZAmxyMyk1)n8ebieDr>H{j0y6c=8n(>Pgs7ZP3fOmK; z&2cyUz#35(lHs3CR_0Z2l)1b&!z-cB@_bqHgxFErr1yUe=#Mbo^F1NmRvwgDKZ3BgqDYLqD-{QAWi#@QU6)p!T z<0@RAG=?nac8XU6KO;tPPkCd?vAl+z$oY|}>Uhy#9CvhcVRcuV4b8 z<7Hnr1H>C^_XEQCZ4Q*z#J`v4n{-$YN0S@fpjTTZ=rK-i7Iow*ECuVfc2Ko-t&G_g zWZn)mpN|IK3KE!}|KPo?Jet6>zKEl%8M*1~r<7P<{ZZ91;rL<|%G!khHFw=lZI)K+ zuf}brz|rTIQ?o|O{IHH_?1^O|QrU^W;8=Kc*1ohq_<(=?B3ntD5*I}IYKF+B(G@qw z=!>Cgocsbs19Ik)t>{;Ux8PT!XbS2dr~N=xs617f+{!0g4d-W8xKYjpkYB=93bwo= z@K`&URL=TfsrF(PVsaF5%m$4%w=8-GBT+J9+7)^mp(7_hLt{W+;0pzUFz^}135}&ihVV^tdwb9<(g*CjcSY5 zPU^s2FFJXEW5$C?^LNqis(?%#Rmk!c# zW8GQg`rO6qWi+`pZ4H^0+`Ou20*g5_rbvgx^8WiQkJVzm$u~qKR+1_efvO@h!lEvX zkH@}MeZr8098yx=>5u~2sv>ZTRReCce&0Ejc3m1%fHej!eaLJNo@$1!lNHsTV^oRO zLX^bfAK|iWI3`Q;4LeedvSVgyHl$0^l<@&)P7U~dJo-J^JI$=as?+lYu`4?n@lbj? zk!(}S=uOX!767;~!*9)BZMDA(UmWHbTRaT8Jn39zTqf|1s`3T+K$lS=<68<3z9Z*> zZ&-If)Cfs|?!(QL-wgRDBeuvu^!j1-nuyRF5RWZ4gL-Nst6?|Xv6qG_NFzj*es{ZdAN)3^mWH_OXc@YWqY&ermqG$-G! 
zj+l?vpFz^%{VITVIpt6VFhtO_DoWkMh)7dPxsDnZwpp}DJ`Cra*xtVyX+UGLt{>v% zn}^Gtu>x%B5c>vt*0uiQfr4?YlQ4c9{A zdZD7xWf&^!M}WleLuLqV3523FU}1iyZXm{p_FBK6zF7`C9!I}UH|}*}vI`$BQH`ni z;Z3vx-t>JbiFq$RT7)xMUcg>=>0!{e@yMMfg}EijyyGyNwIaaCMe>_;cDDBjFJ7Jg zdGB#Z&uX`+n3d8u%iU5!^Ok(rH}NE?r(EPXp#09w55%5kAfH>n$?O~eD;XM9WNMC~ zpo?8evbEN{GuXY1s|P7a(5v-~>GdcZ8{XV*D|}=MV-e4Z3kcdRv3F@N4!(B~!$Y25 zG>RLVBLxO`ZMBNp7mK^sDV_8x39Y?z+f0fh{*)uhTF3C2!ns6#dEG*#D52~4c9lbR zwL@H3+X1v;j#oUwYdTx*ov>GH_>mPBS2|Q6{+U+VQ)_$g~Bcuu$^I9lJ541HAQYII9cG%|vQBo%}D^a48?~a+y^vo)_mI$0BRCi;TE(>adIBi7Q>G?>%;@t=Ix|7B;ga*j z(b50OYZr&PMF~ovI~bgnWenk+%2;zXUnFX^vk(hq@FKcN+JuN_W@SjIc^$nRY`>hT z#C$~NK_h$Cg>z7lKYUJRN2Ef%a)CszY2`LpQ_&-Th5Sd4<*M_}*{>gI$`0N!QD!+v zo#Ky<1w2je&fOZ3Ke&o`Y;v8zR{Hc3eh51i9blxgj(8B=h^Xoq^pHbGQY$i|??VJb zUc%vm)hk}(lv6qp@3kR%6CT6!!JAcQu7RCFWvh_UIiM6JtV#qSNpNxo-azYTZb$3M z}Fc&t)awVU7EzSe=RWRcP}AUF?DTGajT7g;F^cLjiI`e zfPQ#SJ(Z^xW!D*CK9u@!dc~%>ao`$lpG$qhfHEG%4>R`q!ml4FVzmeQ0V8DQ<*ohk z=?4GjP};=1oevEI0D$KR0Ps1Q{!{8<@8D|j&v;LJrl$RBD`L?kc)?&qC{uJ^s>gGtvbN>^MA z-Xw47o!RMja@1^R=Np#^g(PoT=&aO8ra}efl;me!R>S97jnki$nl=2AfnWMh&U=3S za#7TGzza(o7aXHg)F|Sl(s-@fpFvPcn(Qakp2>Qd%1V?*RqN-R9Oo;p+b)zepc;Ee z1Slk2p!{CH?a!XlDgtNN`|z-)U8(^bW$ObBfIj#NI{2u<vptTL3iZ`_oy8MqdMDYc)do=LZtnR$Vn0#9j3K6 zVkr9A3hK>ATBM9*Rumju(nZE9u0=bU>V1tz^Z`e5yphb9`tpR#!@``6jELlg~(+iOcXm6v=Zw!!2Ac*DJ{CK zzMTG&SR(6~GWSc_wn&yFRQ;C2?h2#zd;5=Be$#j?X_0J=^!;&2nf^vmq%0Pvv3luS1dTf{#&t2iP`~z0#lGzv2$8XHE3AtkyfDJG<8n*q0Y7 zJLp>^q4;ZlRVW)$?8*1*f+?;YtI@y_>#HZ@suF{X15hw*dKTnNL1lVydx5bfh2_ac zYw8Z4qfQGAmcQ|^8Fan>S*UgyLK3=#2*g4seufbn!(RCmVV>2WVW&6CXGwx?X!wmJ zB^~;ncVJtUpVLEvP)uG$%0z)@vQiABr$Ab@V*avrMOE~+Fd+`PwsTsN zVGvUbi*4+3rdv1YI~W{OJH3gCFpD5dq)CAp=Csc|bDnMaKG}TBKi-%M?kLX&MXz>{ z7f4)r!}t3_>yY9KDvdYULUl2Y^h9mrWHwi zSg3eKB!$`OrD`N*>d+@hkJWnTz44G3?=giesvlMKIw!Je# zjfeP`NyFijq#p4SX=>#QiT8W_)7sOQi5>c_1n z-_|Y-d%>(8k$Or^rQx04k+4rcKlNB`mcZR>)N!Kyik9_vRSa(1mBYKZ+^*iID7~#n zZ~$%riHqDs68#z=o@AP&fLptRh+DHGhex-SV9V(zVHfoPPlLqBfE^(Y37#L<(^JB1 
zLoW{)w2?dO1KTB442b1bEe%Sf2E?qRcoT7ZD|1UB11!uki5+2<#g0DDX-{9~>g0lW zwLC`ccO=aKOFW-pj(;T0tlBdhI3wAGL_kO4tpkE4O0|-y%=GdaG*Xt{0vU6U*+{Hm z@tut_*87S=b&)!}P8;OAU}}OmqNcifwvEQ_euw#L?^-;zf13@N>ezuesfK&hFrNAG zp4gqr!0LIM)ottPlM?4xxCx`S=9#tLfV(1`v?c_^^nu+KG50dO608q z^Y;of&dFVlqcoxN%>|=2(izG+OQ$s*d?}Wc-UZA&2U*C*#txlY1;Ywma{VlL6|Cx(t!ZjJ|g~ z4NC)z54L-0RxiPe_8pO1X`S64;XsH6Iq-@vuMi1xd@eC^1fZ6RA-q^9gr6nQ^cixxoD(JV z+&@cVkn|Kq`TlwR6F~sTf<)+px{=_Pj*Yon} zt^EZuo78hNnRb4XzhzPq@U_I6xTVN$j%6=BHfM<9wHxiD4uvXiWxN}aWL3&DBsP#7P3t7>cwA^>?_4~&)e&Am;xcq#bQO4k%1hBu@t7;0VY(K% zH;{_C7zARL?f?l`xgL1|#gsXE2ED?d&&ylG%Q6sgm^!_MK_xn+1eB6yE|h)Zu#hUH z1b|d`-Nr;~WJ`mD+Iv=69N>!T}`- zOZlC$!mp)$@%7oCU-QorNAbw=q}P9z*v`wR|I>^wnE@TC#=lHlOMKQ@Dfr98|JVMX zR)(qm?G{#2A^49_C1N8Eh|`|hYb>J&pjhvF0X$6O_9WRZNYl8--{<7cxvgJc6Ajz& zrkGyPjK)q`b69b|i$a> zHZnV>WcoaU+&PstKF6QVW7)l0#HFot(ps10H`;N+rx~r%xG~`3c;8;5(3!uufJA+Q zSvbPd3FlQd+uekl_AY67mv6csolec*bMOq_SC&OeSHYipcAzn$u6$OB`mRZmjWWOR z#`RDLYTYFBnb5o$Fi$S&@L3<9P^3G-v$~0JzKeXl!vV=+$PSTAbNHzSwzfXS=uqK4 zv7_W&6)46hqpBee+S(v3Gw;JV`_tLnqIsU_p!v4T(C$!U31iD?b(Q18!Om80BU<)j zOvggu-Yk0OYU>AzajdopceEiqsTH!@R z@xy1P0_M}J`0_8m!P!tx;crox)1(dSbb5rr4YD(D-geekbcC=p#gQ`U_#)HeR-SHj z3K*^CGHm%3j6`+tND(LaF>Sfg38TsC+smeZ&k*CaJys zm<`0J#cnvFdEQnlR8Vy!Ny!>EZP45FW3Rx%T|q#O;}mVct+A5J+}u-=_=pLnF6Llr z514A;t!6B;MlcIR{aIv?-$f)7GJJ`PF=6JMg8H!;kLA#D13O6iq46`pLM8z)3}t4~ zpL`K@c2oMwO*2l1w&p`_oQER-f}{ir1aOVWG=<_&vlpS5DL~_DL~wEI7yZ|f>m3EG zx9>^38c{aV`VJifIf5VEvIL**JbqOt%buM-H_{-MK=9aTy&84Ztib)cJoMZ=*@r(D zF`3MyszKeg_8QYyGBsXKMh{fGvDM33=QeH#sPmWN0}v0epgu*^OIU6gx+cyyt##9Q zN|@5oXfPif3<^JPAd^GgH_iGfw!pe)_xiE>_^?k8=ZW4<_V$Qyu*?}f`$`zcN)*Q_ z?7O*_sKIBOO*682?72|tmIBx zPwcX61$F~~A+ekFMZAufjRAR$j>o%|gI1oTl8;_HEigwcMqQn@cWNpFQZJ~?04c0= z#!sbXUX?uLT{RK7Oi+w^LHJb{m$u$cb3b<^d6|a zJxt@K*?VYnvv#cW(*(Ehe-bMZOw%7UmR$RFK-R3;@~VT5HIs9m?u z(eq*19CC7SiwElFHQ1O-054G4$kCe)2BhYYUBLD&g_4oCV1e)l@9LXEp9`2QF4=NVVfFFetL8&Egjg{%kabwy1MM00fWmjhWmW2S=khxQ9_hopMQr~bp{bDlY!r9 z22Q+Lz+4-g5o+A%%%pYf?Q+nr?C&m%)&^bbW(JK7-+MEl@v 
zxv<_83>3H!oTYxZxSdgHqErGQq@Cr@w_-0L?+U6?<)g2~hs+#p*l^C@x9hNFY1@;j!4@vHc*fqPY%m$+Ejb*q>K zbR;Ink6ghYqrDpMr08k^f~O+qrr5*`J2Y$|9ZK*8tI?mh*zbXt^F@W^N!N+S1-{>{ zI&VglD`7SrG;;#tiUGvDf#0n9RnOsecu|GZAuC*&_p)7ZilZq9etJ_Z3UpH7YPn?r zYJH068g-twtVYIe8WWqac7CKcO%31N5EvC-`qzfTGV%_qQFwzXN~YEcq97 z;8U;jA6qAXhyT0w_b>3L3iESa{r^(}|8D7b_2yr;I#K_$^lw_u-|@fe<^IA)VEl>y zyL#?-`0om(zu>Caf5Ly)EB$Wb_q^_3CX9&wH1YQ=@9*&6a~XfZYsvow|DVjp@8I7Z zn!mtD%zuJ^b8mjP@cT;lFAGSYto_F?Y4rJT5dX98{T=`B%a*^;004pP00957i1{7< k?{mc8;omv`2LE%$kd*-aWG4Us?B|R3li<#OWZD7#9|D2~CIA2c literal 0 HcmV?d00001 diff --git a/faro/test/data/sensitive_data.pdf b/test/data/sensitive_data.pdf similarity index 80% rename from faro/test/data/sensitive_data.pdf rename to test/data/sensitive_data.pdf index 6a9caf991beebd4ea26b995201e08e9d9cf05665..0230b42be5399db0d1765faf3ce8a3409c96a859 100755 GIT binary patch delta 18449 zcmZ^}18`(fw>CVnF|lo{W7{?-w#|++nb=MywmGqF+nm@={+YS=`@i?rt*^T3oZWk2 zKlQBM>-1h{HS9pdH$cWJgR-#3sZxTnGspR2g0i#3CDS9bFv?h(IO`Cx5-|}e6R|MG zjjN(_{T(xkvl21?n`IPdBVq<5(1GZ}Ff;#CV`fQ+1;I`z14#mMu=DdHz&g7)nHt(6 zzxYwK-~153pQe9MWHlo^bKnRb5d-ouX`IjTee-B3v1>g`;y*8-x8NS>lMt3uBe}-yz zx3v%R;=R4Q0m5yZ&W`HlbCu({W}Z@s?gnV6pw^&fgC=vHkJd0-P|25ANRJpWyte#P zHXo2fbV2VuVqg>47p$7Abcw7+$Ij2L=ZEVJ9;5)-t@Vz2Jhet@y01wVCz6s(o(%-= zV%}eXE=2J>!WkwF*RuLVsV)onHCmC^eT*N$sC~3vHEUq27&&QZP19;-W=Jqil<5A7 zAUV=cs8<@ZTlyBQvsI0`)>|~XKr#B|p>6{)_r_D((yd-pWpNBwGyVu;1#FcPJOjM& z@rGlE5LWw8G39M~)YqKpT2Mb^*X^(e2>K}C4A1(;<4&W4O&sg!!O@|oet56!;bBS{ zLsSAtbwH|bM7#XlY@v~$no4VF+hUDwG+ChT>3+~hPi>o^YWwY2chg_i)C=0&TaMob@7rSHkJ8#y`p)ChzFz!2x><543~n!`1eOwCNRa z3;&Cxh|dBE?ZWAd!F)(+q`ieD>yI&iv#zI%nx|__9o#TK83kXs8%Gt}&j1&fr=i#E z)GmnNi1SAtvHCOpxQkyV>E9T4QOwZiMH?o2l^uOQlC9eFt$8W$d!#kkDjd^8*-+fh zPY9N8e=ulbqChL04D8vna$~WXlet}wR`d4y0AXrCSn25H#$d+-;j?2- z=p2+SrqF>Czc|>*$b3_)Cpu1;bMf(f~UBYEjVNIunT>eMu@cGf-r+6`TVHv z(KNS^=KZphiO=tow;@B}xMVnO(CX7mPeO^IWk2ffBXM{&V!z~JzV4GxQ3?e*v4+$KqDm8jLR`P{LSVt6I znlQVIN~WLKplXFx~)*h7P2Wtvhcq 
z1cl3A;%SBV?Zu^Ds@>1Xvf2PbT%48sE!K&)f{}Q%x9oX`!~4lYFR1VeUXseS<7rzO zy>NKK_xTr1EcwdLAn&ic?IH#%XK&}qQDKv?i1NtKWPU~5oY;Wisu>yMESTCIX@@!=sI(7lJcG2e` zP3bIN2W=^YOdc}Xeh)ug-)U{IS_rohRsqd;uhQ$CPo_`5APMzTxhJNvfs@}`CyAx`AVI$8yAOHXw7>ZW z1pQSnR`dztz#}rUR_HUm%bw>?}srZ(2_MfLH{gayu~SIz3UgU zob8)UuD6`l&C1#4KwxTd%I3glL0{IpPBm{~s}3+KO^4^**vVWUF3n-+DKG}P1m*L< zJg(nP1l~q>KQC*X9W8NkNeP^lA$xC!losh8r`&u^YrPTSEPj-hrN&{>Kb$jPHEaj{ zHAg8))xjs0+_hn$uCdI$O3$&smFvMV7P5_5dA z5-tYZR?t(5xoX$B)+h3N()4!Ql8GL53WAUc53uvQ_QCfpgjUAz6mSmDOmGn|3?h4t ze1p;-R?=M@`q12qcUt&^PgsnbCD7avSw}p%`AcgS(apF?qE#<$jwede@ad*zPKWt) zX0h;JYTlQ|JlGk7Vbc+K>W{roRSaa4mRYjNmSfCHMCQbs!ydzHy=KaS%b#S`yL6kPpo&Ppx3|Sn*pMYY)2+GWO9QVvg<@gQZO^H=*4KY|#cws`dUHm>=1JAek)J^i==JuEAX*2+W{MxQkid7Q%@>VoZC z&;ip*gCUp4Dqg6$0$*x}?ihDE4-uY34MKxyW);e3YKxXvQ=@}JFMdtc!fGQ4mIplh z!@{$cgJiP^h>$n8X#AqH$^g6Hx#?5GiW!A(Ht^$&WxRiL0L%Gh^)0rQc`ReEW&s$_ z!m-oc-{?9c+aQ;=TpC(g-U7ecJY2cfAR-l?ap*u@<<^sk(A=K#`zb5 zrJ2^Q;Z-U66Aitw-6^8P)c7pf2wiz)S(#|$B(e$<{j1$cTqOhAnW34)$K*MtryAkp z7e=Y{Q1TtjHxfQix8WzpaB#QRFPp$GR*r$^NfHi{$3qM)jbBaONUzph8{&d3;XDyK z`*usqq!$*H{LU^v_JBH=UqNTGng^fTS< z4ktquZxI-%QR8LCiL4iEtb52PLLDcS&U!9{k!Q_Jyo8EpG@imS_r;%48CqEJ3#Jyh z&jG6q^+6b&o9_5UL@YbYnpoo;us=MYEgRvjDn*G8-6JO@_ePv{*_z29&3;$8$y?=I zao>pdug&nNY(Zj0=1yrvfb0UTCsdBEgKW`f65VPWLiGpEh7D)!aPdj6YcRWO8a!&$ zi0i1#zVAP13_}Jhw`xmKTWIY?+hQaKDAyz<&Z-_5`-@b5kw9q8o!lwl>x>!n>!qZ< z?kxG?1q*)kn^39AD`C8?+5&3tc^(q6p7q9tpLt=EY{I62$f!RLjQIc<>S8A4bo`A{ z>>CcQdiLWe$`ylrFt#T;kRnn+`1rL(*OuG64k_yr{an2H&wm-x^ddzHiO7uJV6PLV zT+ma?vN3zmJ~~8kIq?asXv1Up&%j3Zn6gv;6cdjdq?u7t!mw{5v9kD zxRIFx$v9!c2?sOeeUmVX2s zk99;*C%}xQkS=l<;b>urdsQtBr{4E4-$(&9Bqi3!1vxZ-mVE^13$62znC-Z>pZDOy zw0HEu>jCHuBA}C2_8ykjXMEhad1!1v7*t@@gsx>ZS~%b+suML$$Ic`dMlT4KtBYu~ z#y;F;Kq5*FpozfO-=!kR_CmEp6QCHf{_KT%3vU=<@0+x1JBghzb@HDG-H%R6_D{x| z%W>!ByZ7tulsgC3GeX&db-1xiT#mX_3v(x)=+V#1TDicfOeL$CCG%4u;;+CDVjJYR1d#;`g3qdjBUCKCi?4O!OqA%Quvv>xEk%k*(_~=s zx2K3_gxPhL@x>6FR)amBL3sKQ>oEW|pH7wK&fG0fYJC$ER9bu#15~+IfeM_Bu|9s5 z*Q&!KEO^UH`WMLHEL)fm#7{1%v5BSRPpuQHD 
z->R^H^F#>4rU(OQGuG0yaV>Z1iab?i4e@6sGIaUurvn{TUUT}s!uJ#p$v(pGMT$zf zrOM>J-us5wl&yPN(1@6DbRke_C^C2Jm|F2_(jjO~HRsl|G%|No2%Z(n%+qt@SmS)7 zHjoBHBKjJ$Bj*_*|P zDMEVvC81odbPzyGo$$V$X3R(yI*~R%?!JZxWly78{JNbd6|(Ls1oM~ze_W#fQ|c;E zl2l{JN001O*$@&Chhsoxu8L$yz8Q=DvT=lQ-@j#3P~E587OgOZJ@3v!9BMf_&fJ*c6xfd2 zUtms*8Z~vol~5-??ORncCk+q7UY7~1cEVWdZ*fCYUm97A&O0(w4Wu?{h96@JFDJfvY3Dk1iGaRo2RcozGI|xCrixPH$N8~fEmC|W657yh|NXe7ku)&~9HTcdHI=PG zz=H4lYDo1*-ED{4p~t1P$+oUPGxi}OiUJ?S^>7}3zHO8fzxaN4mV%k*jfgk&Gu?vG zGhMGD4G>mZ309}aPZxmj*35oi1ms_PbDzu?&pAwV6!8<$zu3`Jtg80j-jZ6<=wq4$-riojbK)$@{QVwlsI|BZ-nm-B$23>u~I-$ z5oT2Dfpb|ET{!q91qYEHj}ex0h-C%YC5gO|aQr#zPWx`@wW5B}k9`Jm*759Bi8F+x zfoeG`w`s|iQ0qlaCnro+2d!L%nIG%(h!0bLamXiYiU6s+Ms9eMGzbauDo)jiF&DBdA>Gk*WykLY*e+Bx1@4wR`znn5E z`F!ACT-aYh_wBtFQ>g+wg@ixBsxS8Hp{RB~gWn0afHzz+HlWqPwzSGmULN-So5g>U z7p{)URTvkjr$bs-PEgUE5KAoQ+#pWoCzTw@kgi!`HJ)+O z#R*u;1DhTLMDCmU39;{S;Us>_!{iRX7CP zEa4#aF(XFUr;T4^CBR{YCV1eYCaho+L8+LU z+ncIMC5&QFgEBKEFyr7709Y9{EKOW2oQVK{zs1YH(g4NnS$>o5902$}Kw{x|+VW$i7#1SSwgDmhaVOT&Lr`AYzR3&6la#L33Yz{T<> zCL(`;{0X`<5esud7l;fJ>wm8EXAYl?9LU1V`FHmJ7Q71;1j4yI5U4g z9W&;=*k;hrWM>ekvWe}ee=eM|0rk$FhoJ<_+iFrb`X$Nvg&)53!fQ&3$e|z|^+>$! z$VQkj$nWa|`7wM~p6_f~GG$KV3$S<`67}s;-Z)7I?2RwXOKk_Le^FfF?_NU(WnoEx zB8X*TW@2VxVdh|CV`8OcVy0wbqNFBbl(RPxHFPnh66I!L`m;~|QzK!Bzyy@_&-Nn3 zf#YHnw{&uL`NJwx$|n{GE)c-=kJ1LV)?L;XpFMFklMmX8@J^i!_2Hj5aVI6CzZK4J?7ee<)m728sG()dCXxhA)_}`htvZi+CE*8v002Tm%<$orEb6>b`stb+Bt4Ep7WUNwV6lo^$0sHW%z2-&~ zBV@!uaq#;hL?FV@sb5OdOh944B@?~`-n0ew3hLDEQl48^idozHI)Q%g$y2tXZ*pkV zSktzxwy4~NDqQ$T9B*#~K5`|L=IR6jgaWOYogT7NETd!KhKrM>xo5dcAJzz=vgK1I%}qj<2p=ic;DX2QyL2Qp%`wI2gmi`k(3_m7C7(+-3_ z4L5xTWWk|UXiS#{nFXvh=(0=%oU#-SeG{B|K4;1mq}_10!<9clI*JrT0Ci&uADA_f z^~Zorp+1;Y&JVya3}M((6B7*bAsw>Vm5{E7;;pU)fB4)MaLkb;mZ=g|P^6Mp_=g$G z`jqH1^_MW(Zna*+O_;5asV~CxbiHq@RGS}DEQ?8jlL$i}O2%*!KcDOG_T*{U)r3NT zOb%0m=STEGurwt8!bRf}H6@?)FUzZR@7KcmW1@va1FQgUedG*~ZQ^Wi5 zrFHpMpVB|89YToSm~Sip?KSYP>p<8~Ut}vo>KsH|@oR>c4zyLo+Z9@iX6PXu2l8_! 
zfh(!=sqsE?*~sKQMN3Fpn>`Uw~IZtb4myFzac-Vb{iftQxa30QFqJx6a~R>T_Z#75h1}EB7*=sP*2eoD>#g*z{)_+Z=;|^o;o?BRj+^JiW^v`(e5k9*@6Ny-7ojtR2UG@w zV{vIBgQvL2>!G47EeUAKTJ$Pitf%61uWEmtnAZ!c;HO{3b5v%tv@eB9X%aQ%67BpQ z`^0!?c~04udStOcG-4$Ynb4fAfAH8E(xGo5vx3wPM%-CZizy^w|?uq zj-q*nSgdu|f?KVuErpW;9=@QgY%3!0^aGc?(%L2A7mC2<9VHOTE!!BAmdudXkIVcO zY_`7@GHmXS_EpiB^dW9`6bBEntv_n0T1zv;p#I%RkY#~n%P@tjo~*?7rzg>y6*Eij z^33;^Dk13@G+bOp@_i-$t7|2rNpc~TF6PDx-M|7rrcRoGBz7JHc$7p*R6VNiFx2cg zE#&ccl;LF3{oFt(F#;}#bGM==6!sG63I}y??&-XML&J>h6#C!DK}I!IJ!eAkFnaV^ znOoEL-|3x`8wW_hu)to75r&x+ArWU56e8!KNjZ%N=KT=w94+|a(3=Af(c|siSy3jP z8gLiM5SQSCV=lw1K+iC8QB0*VglChbxV4~bMQ)zSm+65I=7dNW*i1q)^iRbas2zZv2{tr^uGfSnXno7ebEtR`_W7sLP6X-$GE$ z`+8{9$A1E4b_ulE52NZqZ5(~t-t!t7O+(JSKlr|Wx`WHbUo#@=;_DKgjd0Tx!LwEW zB&ohq0nn;{@7b`GUEq@i+-cMriE&Ldka`Kw^>0St(yV5o_o3$PN?z?rJi2lgsA8UD z!K*8NeFsU@$H1lyJt+3>y&$1dNZT(rM=CM5_GbmYyd6Zr%2?K-tMplb@({G5J?YWr`JT@Nsyrqe=spvI-mv#czH=Wu7M)hQ#+KPZ?XV`%i zPw9cyt0$T`W!(zWMwsk)p#&A5Md~!&_&-06mW~1?wzO*x(&eI2F?WRQyeaM+iaX$0 zFj<1AG+Dgtv%fht&+K!eQ3ak8K`qI0g@;Ci_9z6hMOTB}^5DMc{h5<@{eTpJcOaVW zpn$+Y>M?$UxH)x$YGCeG^9L}DIB`jTk!bfMsLhq1v>&!Mi=l~(J*D3G=M2FF0d$Tf zL-7#757~@m^k#9o47Y-6yX=C9?=O(0v*_5;3{x&E`4MXQW#(dKAoRrP=1Tdy z??<6zufe3OFIgr>Z+?U;zrHzwZ91?P-RyrYz2; z`d>oxNgzcc$CMaMX9FeS6y+THgEF%ln+fkhY)l(Ljcc4vzkm$RNqB&3Lvf~=$5W0P zp*}O=*x)1Y;}{e)`tx>-uZ?=ZzXbfgj97TO>%_{5i`YAR53WNbIZ}!|H-g4viRU!B z9MX4<WEx5{@p;)&6ruuHOx`TzAMvoX9 z53Phud_x+{BfKyA{S4YL5?MHSn-u)ox4(p`k2sW(>5ab+X!69{cgYO&iQZ|(`k?P) zWO)Pan_~7!-%)}8jNB>4<{!CIffpFLf(eabeZ%UjGkJ$6{N{nw*M^PA(jKv66k5#G zp0$&Kwb^@R7RtfA-s?c$rv^Wya^|PY8HUKN-~m;u;IVcT6Moz2L&0S_e%mGL(ib)I zfNLlFJ^&=?u;a^9)B6tOULw-|#l6z$bf0|Du-PcJ$ULS&q`eqWQpq>6x!AckW?9NN zf;*;+kIQ6m4zZ*;J7(^;@AtN`3czZ?f(Vq7c=7X5Zkjq;TuWUKdTP>A@!c?n}%Je5Q{x5 zMKrfVFs}PFz9S&-v@_lBew@zooW>Q$FN0*-c3?h=_0m3=l6i2``2l%((-D`oci&+s z;pQtIim3o=a5I%9Uwz4uK;ex!?^xBU!IeUVz1uDU6Bcc9q#J~3q(_K9-SQC(5cq&O zy6H%b|LqF{xBLv8F8}3egM3?VJlD}_P*~ux<(KKqw;9%G^4im8d|fN&XOl#z;{)^b z26@<4tF*n7LP{9*^3AH#DN$vIgj4UEsc+WVCgnY7IV7W2vX 
zgxk7M>|+=n3_ZP}9yYzAm&{JAXTAADVs3cwgLT=)4+X{E?hp_f4&D{k9HX_iaE`XC zbS>=mDGOP^&A`=08_KGzRQsNGuv@t`h^36R&(5sVWqAr4$_Hw7owx;Ui$*j}R9#)sTYU#e!U82!$h=;Oh(RM^b09xNdYB}>K zUcG-N$v#>S$~bT8uiA%)W{^_(9ImI3F?cU_eS*cVz@(r>FrnP~yoSJ-+glyo~Ia!;D(@1}OMf`ZY zBe{{3S9nLix)FVCB3WX8qJ_*83RUU-JnD!t!oodkfhe2heIR!HkQ!figsXCAx%Q7z zz1eAnVGZ0kT1TF1{qmIHERUpxz0EvTJE>+ zSqEUEj42s$PRX<)@Uf+dI=V9fXGW_(k|H#J4O7RUlS7=G=y}IkV2(+FT9{AMU9AQ6|mIWB`! zq%*<>9g8v!k$TJA#(eZ`a?qxq?%N?V>Q=iTCo-%qbUWdb@wU%)NB6I8oq#P!efV}{ z4+aNr2b!z;?zw=3fMQSuNO|yRC;~VUfLNCzlocONfW@qi>ifh81 zD7qZBYl1@O2Ml8wNx*-j24OLb;K%%N65a5?&)y5AY+7a@c9ZZJaLA|o zfmSw$fXw5|1I-?Z>i1Hu4}1L{tJ~xQ){vAxU0_u zEeX6(fN8jZ;~eoN?j1|b09$JZU?w6b%;TlI!yoTyvEo#XBRMdS-PJFLbUfS4@H=%*h$hUZzv_R{&kz6F?Yp|`nlmVIa?-9F zvtmea$;5MU^m*F*6?iecZ8On|6x0mo>=@j)<33`GWq1|)rI0`F=$B>=GA;dA6lS*7 z#GW?K=op?FF$r02rJ)nZUUQ@Y%cJA`T9a(-39Mf)&|MrSudT56T=)hVq)9 z!wESyaehrK8+>z0hZesFoFu^YWVj3|GoyRm^#{0uWk}FR7qITVWMR&Nm~#HPZU}r%S^2JJ zEG3pnKgaOIdVKhM>kK=mMoc2im$2m+ zwGWf z@P9f%ub4HD-NSZTdEV+_RF56F0X%i>XIbU3e=)r-pkxTbzZCq1fxbh_iqcSe?Nsp;0);h2vEff?T%;Ee=S$msf_m_X+? 
zaPCg8=H6pT?!Z6h9$0&YVBi>-oZ4V)14%mX;1=a;kWZ1OxmO8!+9@7H z4KnVSk5HRsv734bHq^sC(~C4~PF^FQL|=vN`EJSZE6i2@gw zydJ<&fcchL@)aKS=bd}8O!%w7KtTBLf9Fz}F8~2l={oyWW|pB(KSX zIhF4%d}nN*mr6ua^Uxz1Hj%pFhR?DlIiyF^D*TT?Y!bMBebvKG(_{s|FeV8rDcHh;p7 z>v?eSl41Nu;z}l(`ww?kp%R-rk5~n#naqNNp)|Q{pZf5M)s4rjw(#4wW}nZGcikSD zuBX0?+%(3H$=h78+~V&ssv?~5jXsQAx$pGFLAMs0Hv6jA6YIXKm|R&Ow6I5zAKVNj z@7TqA?Q!h6NWWfN^s^g(0k6>d1%!WMw;n4XrDedvowV$DI0fnq)jQUPTuZ8gahn}q z>c+xd*^JNJA3KhHtE~aQJXiE4Z-2ltQSwvy<%f?nS|!6O zXAnwK25p-;>5fc#o1I-dJ6A`E)5ZE|-kphUJ45+FST9i-yF3=JYPwim_lC=)?_}n# z>)5x4)X4)NZ?DXV6ENibc1G=1?(DZibiM#SK**DVW(w^9yL2gvCdMziYKc6CE`|`` zXTtySydD_1zDE60PC1p3B2_w_=o9>TIx=?a;EIM+qy*_cr?}P=<^*?h%POTWBt-B` zT?tQL_*C6+f1bE|DHxnFhWgStl<>%cF`OkymzTfc|D8Aa63Bp8A#7z}X9q$_6D7<| zHaTn8MLkSEEEG6MCUgDpAt)Kb87Vcmqb6xLI879{eikv4@d978&mR?a4$uX8oc0|@ zPp8ZFfrBK86G{HuPQQn01>P<~+2=e@1m$vcJxp{oU#twlaPZ-0N;Me9iz7CWxKR7D z;Us80e*&k&Vd(`yMm6WTW>T;S!Rd>@6E5tG`f*jERK<#NSA^SJCQcq);Y%$kwmNm+ zo(&I%q3>{VNKpc2zs%24GB&dtD2Vd)46CTKZAE=8uDe3}M1sh0L71!q{B&QFH_#R) zEWmkb0CMvoWZtnk%+7U7b;0d*-Adzm975MQPXxzz)@stG`hBT<)ouf(c44m)Q zLLf9;J%gUhj!D!EPM~gXu;Ix9ex;je{}Ki4ak5^R>*`0FTStnnGH$)TuI-B+H?+5~ z*QL`@aKvXo;peW2tg#ig)WkS7>f&mDf_(WP2kNe2Ex$P=H%X3JTKapV!xAaNGzQA& zA6-zeBed3ses~+PXm1cHh=^+KnxiB5xej+{AH`q|<7IR7rynpWA{O)BVL0qXVCu<4 zV1n(muTzBOPa7_xMRV{xr7*n3Cx#&=+!mav8#}kokdN9Lt^V4uF`br~U^`?n&Rk;) z@iP7DES-g|H%r&m1v}~Z+U2*T7sJ140CXbkWcxG}zdpqj2KpQoPli=_UzHm?C&zhh zDepuT=OD}s)4U9NODOVhK1(paYx1gK_GzG z5c0R7qAzge*nSiG(B^uHJI&OX;ZBqTMg9$eG@1-*>~pPXQzJW*-q9)?p>g`_7zpW< zicfL{!Ig^t|k)kc5ky-PyApH7%q;a&1t6jfe`YvzE;{{FV;8Ay4jbj&zr zufzS^`)j>h?{2*NnT?(^2aG;0)<`aQ^A4rNkC2V}_iojuq>=FSv*9kKviArj_s6MbDiZ`b|fK&gw`k9U)W zyq{e}<#mMx1uo++u8&)+g)>KFtYR8gHYM6uvZ)(N?2U~@OwkI~q+7q5DDD>Sr~yBH zAnQ|n5#vEmI>ED#Pztdgp5YkrzPoqb@V=&+UX>0@(4A3D^r)!%0fBNjFv-X-MTX>^E7h36~Zi=8~=-gyJw ztqZZ*{%eU7pec>o|G(TPPWU^#IfDkwje!LuL%5^ILmMiE`tHglN58Wc>_iYJupcvT z)uMY4ovHz2ZHcM@vGvygY-QNUdIYSkzgVv|EIuZqMG=h)@7}E-IKdHsLf*Im(o$tt 
zSKJ!=Qzmke`2$w`6uTpChMB4%il=b8j)8^#lw|JjO|&u2HO^8CGcEbQZtU9FH~vwN_&6o2fx2UQSAg|(wU&Hn%A&``{L<*^6 zCrqyPp!Js3`#s+|k?`};)oE@ZIpF5@e+$nPb<7K(p4Y2LcjL z&6BZ8^LZz6#8iNB6LlEYI%%C>;i`u^HzMmXbRbd>ME9FnH}O(KSd>m(YW&L6Y^%-Q z*EA#8HA9@6YP(=WYfB{2ENNN%kaU(16UplW`@E)K^dC|pxNG$g>AG3ov!qwui6g=6B>t#L@4Vdnqx(GL-^PG-=a9lB1@dj0klnN3E4! zB&}Ps%WGtpnu1PJZUqj9-lle^=6gGq>!6APaaH_Rq8BS#hb5fQrkqGS0z5~Z z4r%8faAQv;R!ng9N4vaIW89d-O;=KwSG-oV+3K=MZr>2b`Rn*QM6A&)N?DkiP-XTN4|{&AhDu8O0G=A{T3Ak+VgK?>1))O(?2+`sxzijZ9o5)~u_O9nwgi z3vGqO;Hb4+_ONZeOFe23sc%9?){;ZSt6F#0Vh5D z129!otrUHBgZ<1?Gaf^*Tf&!&oAQ;5M+0@0mB%;IcInLlYcx7*@;U33&3FZL8e0

E3j5*4ch%d(0^1m6fS!s6@~5{ITFjQwt~BPKHSWOm4z%mhu`M~5 z1FJAMc?DHb;_Buz2-{KJ8?JV3tw)<8=VqXjxu{W~+u`|Gl56x+?U{z2^gPK_4WvZU z<@W`V?A@@PHqSfH2PDl1Vz}qDi+MZzQ7uBn2P@j>PU#iWzH8X5fD4Nia|&^j_P14% zj&J+tk>z~Uqv9ZLV=mF0<_wG1#)HPi_E4vomcA!*^xpF)7`5-moAXvTo;a26qE*11 zVM~uC_$|>mz_{7bI}ZC;U9OW(V%W5^jD9Wwp6yd;QlXt;T>~}8!fI|~B)`l)z=f7} zv(HWHC>B1>mv1(uYQ1cRqrJlPPL6f9IE=nFd&P0VTz6>LR*=9nm z7_7|VAoQ~Nte_V#mviOYo=&(ND zSw-wpnb^$htb5fb3+*ajfiKeS?1GJpC*}x;{aw_D79WvQmzoenc@|WXW-EsM824cA zr4~tco~vd(w9LUUy~HHAO|4*QPPh7RZ8sz}!?uvK=!BsX5njx(sI$m&@Hk+SWhu_u z)IhDp7OcfLXVq;BIL$F`_6NNE#qD}wO9}5pVsqkTYQBW5z#AXu+7!I!Ddh`Mty!=A z2oap=x`mx5HR*E~JZ}u#m{EnJYLmNlVlG;AsbW^#nYfcwQz|6{TBQqN zpGBiK2%C=+S#zz`U$SA+6)8a1(r+l!Y0hYR2X6EH5s5gh;;NlEvVGce=IkX#v+!%x z@HvfCT=+nW#l?DDBZ3Dy^K^!9i{*+o;*FJtg@%>;Ni@c*isj0hT)+kx9Q-v}aarOzBTk3G&T5l0AV&5Y~V!%l2=> zA~b1_qq7q2D%r;RiDWlEF1Gom;U0}o%-*5|@p#z zx9RumzQBdA@`sOtZ}A)O$fr0_L&3LNb&Gc4QTo2a>CUpMQzzP|IZ{)?w@Y=4ap7@{ zarKpaBcty!)z3{qbhNt0eX{B619h`oBoAaT$n8Dw7#|D`K$%xbl8D(Mx@=JV0>~A$p2cfKsf$2PqDJH{m*Wx zka#(}ASUFHNB)qMa}vs#jkL>?)_UrGWEJVSN0!$_tOwBW@iT0YpPldOi*32IcSpuy z(xZUc><(ZsJ(dGQueWReL5;Rs5hXbH)&6er_sy)FhvcrzMhW5_Ipq!%UJ{2B=iRbjDc z6opLjp^{@c^!+&)Qlh`W)h?$q6t_&Gy3TSsu@UOm@*e$8rGhgId{fZrI1cIzT%9JX zF{)87Tvp7?x1ILRzF{ui_<^>SbZz`sU-bXi%C&_=6@+1D$?%erScInSsdh7~%{eo3 zX3p7a>)4*9=wevWMNRiX!_?h%Tf#21LKX!*^md>dWpp9P5P}6CiWH*95(bQdAm=hAdq?vl$PL;6PDYaqjXQ3qE1muGCLA8g9t zKdmrZ8o^2%#pcCEP}oGdx|@rPB_as{Z)GqT;Wt7mygASil#qC4-u70l)v$qd87rFZ zh1f@kC?&ZP&p$m(`ypVDYANzx<=M9FHhu1ycSE{9t0V8Sy`deHla4KO9%m2MR1a3J zODpO<-8B%hz0Y(ERbL2YvVjx#hdZzC>VNU0=X~qIN3pJ-Pm0=3U5R%Y@zuR+@;=1V z>`NXl_*~Fze0B794{ohXuk9SkXnH;}@0D+OceKW4-fSvAE1jXU;!)gsxbM@zzWBAY zk`TBZnr$rC3+{$;=ikwb3*YuR$@TO-7kb)n1b=_=9q&HpzLe`dx$@Tf_eV<)Jg69P zFUbDZZ-3TV{3s#y&1wtRH{@jTDL~98&0vwastJ@vxh7JW6yQChrYM|YVUw7mLe6vw zrZKLioQhEdE%7L#ma!DeT&piEI+=tLE_6?TNf<+XG7C$>RMirYK|B_4(vNXHa_U${ zDA%7S#f$PbqmT>QlfVRqt=KdyFqK*SR2f@}RtdBes8UVM7*8B~-3(p=K+MH0Ab&nl z#X{xJBwpB<5i>8qyOhKlp4z3Dt$dVCLFUx}a6~n#at*$)2{vSiEUYm^X)5CkF+7za 
z2vvkgYVj(fVw;(gL(>ouuU~LFkyz6byHJWFLn-rw*Qd!T0R<+`1B{Q2g#G+LMK~vm zpDWnec&wgJ>yJpINz3+D e);H{owD4QOi)Uftm_kxzglRM<%jq;Ke18B5InIRu delta 8145 zcmch6byU<{*R}&lBPkt2OHK!Zba#V-f}|h~N;4o`5+fY~B1kGNsVE>I(jX-vA<|tE z@8J7-e;YYSWQ?xcQsxY4awgA|w!ynw^!q2@sYrq)!fk0Kq_AkRl8SfnfnaeQZ2{m{{x*146*qTmTaQhHU^O zVGwXhNeWVT4>#-kjufO`na!qZaZ`$kEoA)|;F3NG_V>@nTT$^*pZ;c=+LBnXj4_Jr+4LiQR{Owi@9K2XO}!WIsQmw zcB&vR+ZOpW+f2eQuo+ z!n%(BTyKxw{{fNz!J<@mO@B$RB*jc)*lMM~+zi7O#$+Oh>_OY6!Rzi_p?uf1yDO~= z6NP&I;tGm?H=bo8^I!YOS`!C4)$Lr(^7 zjxUbm-5031mMZ9@m@Gbf-oh0?^&G>~_oC(Yj}m(#5#4g;K2R#$qu0>+U@LBZXbP5) z7g6P4&!>ga463FRNR$qFA59=iR++s^I0oB($%B_^gH z*rBxN{@tl)1HJi2`!9!v3na|Q>S7?}<;G`P6g$#1v-IyuRVXUlCrSv3pR8iuh&0kz zM-e;}t2gA*TK_NRSm8y4N;j)6tJIXmyXLoBN89AczsMP0dEL3m(BJ;v>(?HEUhkG0Mt_kPrvc%)M?;_PK*ZRGxUN!Wk4{#hx|ay!qoYkA1LEG3nBmuRgyZ*& z9PJt(&@MAitH|-E=IBGKso3 zI6EPlc37|;ewzMe<4&O?*OXeC)2} z>HIQ%R(-jHy^Dy8lw%&tgj?5kwUFbP8ozw$##Z37s42t8RDZgZ$Y}(+K@70VDh=M$oa71q~yab#B3lT*Zs?;D5hg-tjidAS)53hD*=#_xX~%Rf@wUyw(OI$%Omqu#cYg=#HVSt7t+vUmgS*V`+?rB7cqJ9?%{Pnf*Qf~k%T zK{8V>>_up2OK)qlR`O_SeLJAxOCWMFH+iR^;%rI&F21QSfF*Ct@>lBsW?B2X_h5B1 zSUdto?p>WFw?UtX@vz%Lg9-``xSwu#(#$tKigVEMI9vB2+0&Mxl~<(=F=&hq9+5FU zrPP!Fn-l)zq#(F@BP&jYtImBw2ca2SiwF`R(jB5)y(g6AD0pC z+8_S%=fe^2^^UtvoU^fO9-K~@vBl~BT?zc`{KkPn3c+_GoXk0$@esM9`Xh z$@f6cHl+e}Q<@75rLE+=mJ&R{zLmk1$lH_JfC! 
z%|ihirStqI5;biM$WukjSn(R6L$>n*w&t?Op!-K*7Walfz9p&)7^n7VD%H6T?bYP; zOSjyo(ZM{P<_^pwXH|;N%sf1%&(m~(SA%*qvO2M7t-@En4ez}K#+SFZd+8ZpN9e&2 z*>0V!vzfrsthJRqvfvOt0@8QO&(Fj>@Fr zilOt{2ctumek_>4tzLBY9nYSzlGY*5w&@Mi}-Tqx5{tlVUyD85yP|#o&?ZZ_g5b% zvUP5cMm@03%#Bd?8rApDyzdyd!$uwRkd1cL#hM~(_eM$l2chuEr+q87_D=juYOpD=5 z^gw-7)euvnXHOy9I{O|U5h)x{`>>2Cu*H+$75?pd6gu|GK#!B3-v+9-Kn6R%+PH59oVxNY^nMOM#Dp+W_Y*=7R7WywdeJ%$a4V8n~s2PZBNl zT)+3#mK_G&LO#Yr977{^g)=5{hwy^Ki-xa7$cqt9Sg7iL^0Jv2#b}pE zEww&Xle$5A%g&9!fv-cZkO{KF(#X)u!CZ_^!|6qv<*kW&>>>YEy%9;BwAgA_a|~0R z-Ph!&2PRy(B((I;(z()Sg-IWlpF-A1YE>gA2$~2tij~|Qq7FGFf(kvu%0(ItCyO1M zM!tpoXgu=w6pWA&<^_@bDtdvPth_xuWN`FdU^y&4qpAnlKHH%6i3^o^kl;h5Gm*wl zoJ;CfP>yj1tpK?hzfZ$57-d`WoH4R%R-y@9*S1v?G&J_z0-g(YeVH{#I!m+Gka$)d@IhCxb7TOhZ_(oJD4cl^=b;l!_J&%QT*^PQAuqLwMn{=};w zliuiDG>}~-7V-UpOFeYeUbeLxgDJ03Li_QNPQ#5fcW?Mijo0zNSk+iaU6N`&77@*& zR{Z$cUxRe_@j}G-wGmFk=dZhGnyi=fE4&Wc!Ca9e3l_q4M~dc5{o7vrp*9nQ&=H|- zpKotJ+8bFH&|OdavY`nOALdCi?k{+>4CPKctM@a@yArjaxl|+JbHDnT21dxND~!q9 z|E9Bx+bCt1_YfK5Io-$Q$Z=5B05J@~zCHA7MU5aS@~H*{ar!*)o1jZl&}PvWvX?vq zXoPe|b4W`VW!?`Zbky5UshRX;(`iVy2p*@5f>7O!kceYfduCarQ!_+FwK+UL~C8%fh| z*-VT&43$(5x7NSlJu3D>!Vs~%D;6Vc^A$Ic3`k11b2-b<<<_>@mTXKPe>*=nL)FkT zp{Q=jNxX%hZ1WUq`h??iQGh6sB=>^(L0{9pt`DbbX4}EM2@`mwnS>}Qyh0dt#_6uB zzU|4=?Y9SQ2HvVg7CclV7Ip3kL z{vSlbvqrp+YeD&xCxpyT>lo1LYLBq`lk%}|NN9W5TTnTf6>3?S7{B@HTPyQPYEK*@92pwjA#Up(LNL1yP84W*4D^wD#6bY`yQJxp)Qfvtr^AB z#k6c7c=*o5m3}lq{frsUNRfRBw9r2bTd_A^RJ1CPIXHT@PH}mdibb8MUMU8zlk%?lf# z%IM@aW$TVpXt`95szOd!aGS|Ov_d9P5rUQT@E6PZmRxA4kHvK)&68%3*%&#mgWE{d zS)4G%waMD(1q>%M0_=Mb57+Cu# zs(S2OB9nk_p4hGI-7>klxD~GT>=iYx@R1i}ti!ghqWlk7`|^*D8>Wt}NMUBsP-R-- zR&wP?T9*p$H+Q_RUD)tNPo%u~BIT<2($jrqr;m(v#}?xSKG|N-X&T7MzvpEVOx%6y z^(5xeHc3c_0jgokpH$=KOtD&7+*pHfe-26Sso%!|%JNvs@R~0Va%*dn!e{vnk>zi8 zeU`f^OC*)tt@(xGHiacRD(?4LW-JH>4GqZ70K~?BnH@?SJ zO^orxYWZV66{d)ChU)Q;HRrl(6zg}WvUWGRBf76uO}n@qDBC&Jy$6Le20z_AqDvNd zHfmw<_S1Aot>Xy4DC~+V(Y482JH?AD7|U43u1!oVk8F%1)3{%UTo5w4)s!#RdGW>G 
z*{RY4s)o+B8P55>$)}YFi{iw0eLYLNuB!YrpZhK_BAmOXBVNQ;2ki!FM{ecTq~0pg z4II(t*sc=bIMyqJ(zrb%ueE9_R@$8_|?v%W-T1g9<9%E^u$lE_@}>UvT5Ja8~eE z)d?2>bFS=k^Oru)Nva^{(AbpHaN%{1tZTQYq!xOr2G56Lq zdaf9MlaIJGuO%)Agk4zWH~Z2ZUM*d<{e1gepIdTADXsu6Yi-O0QTT$5%4`wwby$ur z{fWBT`(~NeVos8m#lnmZr|SdK%IJ* zV&tRXX}yWM!dkzxq%Ayc@R8k5r=<~Rss~xqa$kRWhquju8l5{!l;>aDUpIfB65km! ze56T3Pt|Ffd)(qaeb_y(G@7Pze z?=p<@juzz19(|A70@X}utQCUB^aqtCSx=-~S#0Gi*P^bQkL(HX#!-deDlCLV@v3h@Zcz{ zEWnQ5R9lqehsg`-3vKqw3%iWb8LFk1t_SVd-bW*8hUibMe+VsKG0 z2oMg2i=tt`-y?7&7*?A_7y!YFvaw?`Sgv8m7-_Kg*%+|D7=ZvNmX(Qx0}4ZdjO?sD z9=HP$(97e_%i~RJCo5+QdkRtr6f4XsgbidOz?!hWzY4zW$2zkJW7XNHv2`pgST8nK z8u;&nQ+;OxCp$}LD{CNJ49m>Q3^V~k5SKl7fq$9r68Sx|gao#jg#&xRrUpg+9t zBltv(PaTvGiD0yP7N{(3FE1Fu!-cpJMuj0&bq$dbz7FIcjaZ6TrqB&6AkQa=W#J6h ziK4zfaf47a5E7*_8^`@z&-{b+M=&UKl{Gbwq#%^O6@{`|P^RnP;pK*Ue`8J@9SOKM z!(byvsF(9DC?2i^glx%4Mh_?YH6$-gEmghYrsN}4 zy!v8BZT#rnvZwv-)k%;%T#h!w?@V!3#Nt zvn1EJ98~TMbkBrvy;C|R_m^&HZ5?KWsy0KvBc`mZYm9ygGTC4|@Q}L%45?dTSH=Je zH7d&U=U4ihhVgev^GMDDD^yKbpdQbExot10Q{Air@gR27{ZQi{^zW~s!?)uqJJqn| z(3ISaBVCJ-(q6~$BZI7qE|=M6Taz-|(ejz6gJ0GRJBCHO(j{zmyK7IKg;Y;#Pa7qK z)+!MR^v55n-`wX25h-zyW8Yb9foha}JhMSP!nn`HkIT3<=}c}ng^zcv6||n^RF&@U zu+?N#S5LlJ{c3b^1>f|b>Qd|1aXPOW=heXjz*A;q0en?DH^O%D;9@DWvH)HT6#BPo zhP&~t_8-CIVVeP@Y?E@X6Yj(w0>P}KvDUsj zF`A&2GlL#ur;W*Gou@IZCC1mx9G3m2g;|GI>4`XJ~+`O;!hXOz95H_cJp^d{Du!;^CTglNDHzeBU1SH$jCH^Yh90pH-K8~Mx&>p^QsBLe4XDf=?r0y>7Ni1B; zA?Tu(n6uP?w5c!d;xlGC-}{{+{?Da~wNr|e41fp#uEWJ*P#OTKs|p5y0C)&!%0dW$ zhXjg30bwv)vPzi^1~8^@hXP>uXcRQXC=?(-2**hpiNI3xGs+=xi4Y7#!cjmZ8i4~Z z)7RyLO9X#;BOy2r;xaK^J}@*IH@6rN2}T1E2zbhN2!JU$6hQjF{h|Hs#$R(I0#=6& z1x(@5f>h_wVm--ad!m-y!4rA<3p^1 z2i^7^9x;bmo8>7H_nF5PXHZ%ha*}*-keRj3g^1xr4}d`ewk$)z zJS)eR-XBC?olEA=_?`-nQHI!3T-S9-Q7!9JWRh5yoWpC2Lz-W8H`cu+b1Udmf2%m9 z1q&d)hCot~;tJ631`$gc#sb7Kxb=D2fKLC@g@EB!*`F8!g~XN5zhPpRYx_?O2|?lN z;NLJ@lK3Zv)5dQe=HG^qa2QSqe;bUzN#gGq3i%HsC@=(f3Hs+)6a?`vBmcI6f}sDz zFt{avpnJ%E(& zDF7M_8VScO+{^ZFuf?G7zdMM6Apa3o6hsXAcODvzxD>)4^P$n`fAYj&uuC5AUk!^v 
z5&t0KFeEzVeJbE`rvcJ+cE;^I{#$5HHqOBRZPER=*zUV|c)3~Q;wuKlEjkKPUS0() yMT!)YGyvQG4<|s{Zq|>1zioqzb**i18!y~9To`cPG2-sn|Lq