Skip to content

Commit

Permalink
Merge pull request #40 from esmero/ISSUE-39
Browse files Browse the repository at this point in the history
ISSUE-39: NLP for Webpages/OCR and some improvements for SBFlavor docs (more data)
  • Loading branch information
DiegoPino authored Aug 15, 2021
2 parents 1cf6eda + 234af4c commit 0537d88
Show file tree
Hide file tree
Showing 9 changed files with 1,012 additions and 32 deletions.
88 changes: 88 additions & 0 deletions config/schema/strawberry_runners.schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ strawberryfield_runners.strawberry_runners_postprocessor.binary:
weight:
type: integer
label: 'Order or execution in the global chain'
processor_queue_type:
type: string
label: 'The queue to use for this processor'
strawberryfield_runners.strawberry_runners_postprocessor.ocr:
type: config_object
label: 'Strawberry Runners Post Processor Config Entity OCR specific config'
Expand Down Expand Up @@ -131,6 +134,15 @@ strawberryfield_runners.strawberry_runners_postprocessor.ocr:
weight:
type: integer
label: 'Order or execution in the global chain'
nlp:
type: boolean
label: 'If NLP should be triggered for the extracted Text'
nlp_url:
type: string
label: 'The URL of the NLP64 server'
nlp_method:
type: string
label: 'The NLP method, spaCy or Polyglot'
strawberryfield_runners.strawberry_runners_postprocessor.filesequence:
type: config_object
label: 'Strawberry Runners Post Processor Config Entity JSON sequence specific config'
Expand Down Expand Up @@ -163,3 +175,79 @@ strawberryfield_runners.strawberry_runners_postprocessor.filesequence:
weight:
type: integer
label: 'Order or execution in the global chain'
# Config schema for the 'waczpages' post processor settings (per its label:
# the WACZ URL sequence specific config).
# NOTE(review): no processor_queue_type key is declared here, while the
# binary and webpage schemas in this file declare one. Confirm that this
# omission is intentional.
strawberryfield_runners.strawberry_runners_postprocessor.waczpages:
type: config_object
label: 'Strawberry Runners Post Processor Config Entity WACZ URL sequence specific config'
mapping:
source_type:
type: string
label: 'The type of Source Data this Processor works on'
ado_type:
type: string
label: 'DO type(s) to limit this Processor to'
# A list of strings, one entry per JSON key.
jsonkey:
type: sequence
label: 'The JSON key(s) containing the desired Source File(s)'
sequence:
- type: string
mime_type:
type: string
label: 'Mimetypes(s) to limit this Processor to'
output_type:
type: string
label: 'The expected and desired output of this processor'
# A list of strings.
output_destination:
type: sequence
label: 'Where and how the output will be used'
sequence:
- type: string
# Expressed in seconds, per the label.
timeout:
type: integer
label: 'Timeout in seconds for this process'
weight:
type: integer
label: 'Order or execution in the global chain'
# Config schema for the 'webpage' post processor settings (per its label:
# the WebPage Text specific config). Besides the source/output/timeout/weight
# keys it also declares the queue selection and the NLP related settings.
strawberryfield_runners.strawberry_runners_postprocessor.webpage:
type: config_object
label: 'Strawberry Runners Post Processor Config Entity WebPage Text specific config'
mapping:
source_type:
type: string
label: 'The type of Source Data this Processor works on'
ado_type:
type: string
label: 'DO type(s) to limit this Processor to'
# A list of strings, one entry per JSON key.
jsonkey:
type: sequence
label: 'The JSON key(s) containing the desired Source File(s)'
sequence:
- type: string
mime_type:
type: string
label: 'Mimetypes(s) to limit this Processor to'
output_type:
type: string
label: 'The expected and desired output of this processor'
# A list of strings.
output_destination:
type: sequence
label: 'Where and how the output will be used'
sequence:
- type: string
# Expressed in seconds, per the label.
timeout:
type: integer
label: 'Timeout in seconds for this process'
weight:
type: integer
label: 'Order or execution in the global chain'
processor_queue_type:
type: string
label: 'The queue to use for this processor'
# Boolean switch for NLP on the extracted text; nlp_url and nlp_method below
# carry the server URL and the method name as plain strings.
nlp:
type: boolean
label: 'If NLP should be triggered for the extracted Text'
nlp_url:
type: string
label: 'The URL of the NLP64 server'
nlp_method:
type: string
label: 'The NLP method, spaCy or Polyglot'
89 changes: 89 additions & 0 deletions src/Controller/Redirect.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
<?php
/**
* Created by PhpStorm.
* User: dpino
* Date: 4/23/18
* Time: 9:02 PM
*/

namespace Drupal\strawberry_runners\Controller;

use Drupal\Core\Controller\ControllerBase;
use Drupal\Core\Logger\LoggerChannelFactory;
use Drupal\Core\Queue\QueueInterface;
use Drupal\Core\Access\AccessResult;
use Symfony\Component\DependencyInjection\ContainerInterface;
use Symfony\Component\HttpFoundation\RedirectResponse;
use Symfony\Component\HttpFoundation\Request;
use Symfony\Component\HttpFoundation\Response;

/**
 * Redirects legacy Islandora PID based URLs to their '/do/' counterpart.
 */
class Redirect extends ControllerBase {

  /**
   * The 'strawberry_runners' logger channel.
   *
   * @var \Drupal\Core\Logger\LoggerChannelInterface
   */
  protected $logger;

  /**
   * The queue instance injected at construction time.
   *
   * @var \Drupal\Core\Queue\QueueInterface
   */
  protected $queue;

  /**
   * Enable or disable debugging.
   *
   * @var bool
   */
  protected $debug = FALSE;

  /**
   * Secret to compare against a passed token.
   *
   * Requires $config['strawberry_runners']['webhooktoken'] = 'yourtokeninsettingsphp'; in settings.php.
   *
   * @var string|null
   */
  protected $secret = NULL;

  /**
   * Constructs a new Redirect controller.
   *
   * @param \Drupal\Core\Logger\LoggerChannelFactory $logger
   *   The logger channel factory; only the 'strawberry_runners' channel is
   *   kept.
   * @param \Drupal\Core\Queue\QueueInterface $queue
   *   The queue this controller holds on to.
   */
  public function __construct(LoggerChannelFactory $logger, QueueInterface $queue) {
    $this->logger = $logger->get('strawberry_runners');
    $this->queue = $queue;
    // Store the token on the instance. Assigning it to a local variable, as
    // was done before, left $this->secret permanently NULL.
    $this->secret = \Drupal::service('config.factory')->get('strawberry_runners')->get('webhooktoken');
  }

  /**
   * {@inheritdoc}
   */
  public static function create(ContainerInterface $container) {
    return new static(
      $container->get('logger.factory'),
      $container->get('queue')->get('process_payload_queue_worker')
    );
  }

  /**
   * Redirects an Islandora style PID ("namespace:id") to '/do/{id}'.
   *
   * @param \Symfony\Component\HttpFoundation\Request $request
   *   The current request. Not used by this method.
   * @param string $PID
   *   The PID. The segment after the first ':' is used as the target id.
   *
   * @return \Symfony\Component\HttpFoundation\RedirectResponse
   *   A 302 redirect to '/do/{id}'.
   *
   * @throws \Symfony\Component\HttpKernel\Exception\NotFoundHttpException
   *   When $PID is empty or carries no id segment after the ':'.
   */
  public function islandora(Request $request, $PID) {
    $parts = is_string($PID) ? explode(':', $PID) : [];
    // Without an id segment there is nothing to redirect to. Previously this
    // case read an undefined offset and redirected to a bare '/do/'.
    if (!isset($parts[1]) || $parts[1] === '') {
      throw new \Symfony\Component\HttpKernel\Exception\NotFoundHttpException();
    }
    // The id comes straight from the URL, so encode it before placing it in
    // the redirect path.
    return new RedirectResponse('/do/' . rawurlencode($parts[1]), 302);
  }

}
62 changes: 49 additions & 13 deletions src/Plugin/QueueWorker/AbstractPostProcessorQueueWorker.php
Original file line number Diff line number Diff line change
Expand Up @@ -213,11 +213,22 @@ public function processItem($data) {
// If argument is not there we will assume there is a mistake and its
// a single one.
$data->{$input_argument} = isset($data->{$input_argument}) ? $data->{$input_argument} : 1;

// In case $data->{$input_argument} is an array/data we will use the key as "sequence"
// Each processor needs to be sure it passes a single item and with a unique key

if (is_array($data->{$input_argument})) {
$sequence_key = array_key_first($data->{$input_argument});
}
else {
$sequence_key = (int) $data->{$input_argument};
}

if (is_a($entity, TranslatableInterface::class)) {
$translations = $entity->getTranslationLanguages();
foreach ($translations as $translation_id => $translation) {
//@TODO here, the number 1 needs to come from the sequence.
$item_id = $entity->id() . ':' . $data->{$input_argument} . ':' . $translation_id . ':' . $file->uuid() . ':' . $data->plugin_config_entity_id;
$item_id = $entity->id() . ':' . $sequence_key . ':' . $translation_id . ':' . $file->uuid() . ':' . $data->plugin_config_entity_id;
// a single 0 as return will force us to reindex.
$inindex = $inindex * $this->flavorInSolrIndex($item_id, $data->metadata['checksum'], $indexes);
$item_ids[] = $item_id;
Expand All @@ -239,7 +250,7 @@ public function processItem($data) {
$inkeystore = $inkeystore && FALSE;
}
}
//@TODO allow a force in case of corrupted key value? Partial output
// Allows a force in case of corrupted key value? Partial output
// Extraneous weird data?
if (($inindex === 0 || $inkeystore === FALSE) ||
$data->force == TRUE) {
Expand All @@ -248,8 +259,17 @@ public function processItem($data) {

// Check if $io->output exists?
$toindex = new stdClass();
$toindex->fulltext = $io->output->searchapi['fulltext'];
$toindex->plaintext = $io->output->searchapi['plaintext'];
$toindex->fulltext = $io->output->searchapi['fulltext'] ?? '';
$toindex->plaintext = $io->output->searchapi['plaintext'] ?? '';
$toindex->metadata = $io->output->searchapi['metadata'] ?? [];
$toindex->who = $io->output->searchapi['who'] ?? [];
$toindex->where = $io->output->searchapi['where'] ?? [];
$toindex->when = $io->output->searchapi['when'] ?? [];
$toindex->ts = $io->output->searchapi['ts'] ?? NULL;
$toindex->uri = $io->output->searchapi['uri'] ?? NULL;
$toindex->label = $io->output->searchapi['label'] ?? NULL;
$toindex->sentiment = $io->output->searchapi['sentiment'] ?? 0;

// $siblings will be the amount of total children processors that were
// enqueued for a single Processor chain.
$toindex->sequence_total = !empty($data->siblings) ? $data->siblings : 1;
Expand Down Expand Up @@ -316,23 +336,29 @@ public function processItem($data) {
// Possible input properties:
// - Can come from the original Data (most likely)
// - May be overriden by the $io->output, e.g when a processor generates a file that is not part of any node
$input_property_value_from_plugin = TRUE;
$input_property_value = isset($io->output->plugin) && isset($io->output->plugin[$input_property]) ? $io->output->plugin[$input_property] : NULL;
// If was not defined by the previous processor try from the main data.
if ($input_property_value == NULL) {
$input_property_value_from_plugin = FALSE;
$input_property_value = isset($data->{$input_property}) ? $data->{$input_property} : NULL;
}

// If still null means the child is incompatible with the parent. We abort.
if ($input_property_value == NULL) {
$this->logger->log(LogLevel::WARNING, 'Sorry @childplugin is incompatible with @parentplugin, skipping.', [
'@parentplugin' => $data->plugin_config_entity_id,
'@childplugin' => $childdata->plugin_config_entity_id,

]);
$this->logger->log(LogLevel::WARNING,
'Sorry @childplugin is incompatible with @parentplugin or its output or the later is empty, skipping.',
[
'@parentplugin' => $data->plugin_config_entity_id,
'@childplugin' => $postprocessor_config_entity->id(),
]);
continue;
}
// Warning Diego. This may lead to a null
$childdata->{$input_property} = $input_property_value;
$childdata->plugin_config_entity_id = $postprocessor_config_entity->id();
$input_argument_value = isset($io->output->plugin) && isset($io->output->plugin[$input_argument]) ? $io->output->plugin[$input_argument] : $data->{$input_argument};
$input_argument_value = isset($io->output->plugin) && isset($io->output->plugin[$input_argument]) ?
$io->output->plugin[$input_argument] : $data->{$input_argument};
// This is a must: Solr indexing requires a list of sequences. A single one
// will not be enqueued.
if (is_array($input_argument_value)) {
Expand All @@ -345,6 +371,14 @@ public function processItem($data) {
// The count will always be relative to this call
// Means count of how many children are being called.
$childdata->siblings = count($input_argument_value);
// In case the $input_property_value is an array coming from a plugin we may want to check if it has the same number of values as $input_argument_value
// If so, it is many to one and we only need the entry corresponding to this sequence
if ($input_property_value_from_plugin &&
is_array($input_property_value) &&
count($input_property_value) == $childdata->siblings &&
isset($input_property_value[$value])) {
$childdata->{$input_property} = $input_property_value[$value];
}
Drupal::queue('strawberryrunners_process_background', TRUE)
->createItem($childdata);
}
Expand Down Expand Up @@ -479,6 +513,10 @@ public function flavorInSolrIndex(string $key, string $checksum, array $indexes)
$parse_mode = $this->parseModeManager->createInstance('terms');
$query->setParseMode($parse_mode);
$query->sort('search_api_relevance', 'DESC');
$query->setOption('search_api_retrieved_field_values', ['id']);
// Query breaks if not because standard hl is enabled for all fields.
// and normal hl offsets on OCR HL specific ones.
$query->setOption('no_highlight', 'on');

$query->addCondition('search_api_id', 'strawberryfield_flavor_datasource/' . $key)
->addCondition('search_api_datasource', 'strawberryfield_flavor_datasource')
Expand Down Expand Up @@ -535,9 +573,7 @@ private function invokeProcessor(StrawberryRunnersPostProcessorPluginInterface $
$io = new stdClass();
$input = new stdClass();

// @NOTE: this is the only place where we just pass filelocation fixed instead of the
// actual property named $input_property. Which may be weird?
$input->{$input_property} = $data->filepath;
$input->{$input_property} = $data->{$input_property};
$input->{$input_argument} = isset($data->{$input_argument}) ? $data->{$input_argument} : 1;
// The Node UUID
$input->nuuid = $data->nuuid;
Expand Down
Loading

0 comments on commit 0537d88

Please sign in to comment.