Skip to content

Commit

Permalink
Merge pull request #137 from salsadigitalauorg/feature/crawler-filter…
Browse files Browse the repository at this point in the history
…-patterns

Add a new group plugin for crawler.
  • Loading branch information
Andy Rowlands authored Jun 9, 2021
2 parents 3b22fb4 + 7c16be0 commit 4a329d7
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 0 deletions.
4 changes: 4 additions & 0 deletions src/Command/CrawlCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,10 @@ protected function execute(InputInterface $input, OutputInterface $output)
$crawler->setDelayBetweenRequests($delay);
}

if (!empty($this->config['options']['ignore_robotstxt'])) {
$crawler->ignoreRobots();
}

$io->success('Starting crawl!');

$crawler->startCrawling($baseUrl);
Expand Down
102 changes: 102 additions & 0 deletions src/Crawler/Group/ElementFilter.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
<?php

namespace Merlin\Crawler\Group;

use Psr\Http\Message\ResponseInterface;
use Symfony\Component\DomCrawler\Crawler;

/**
 * Allows regex element filters to extract to separate files.
 *
 * Elements are located with the configured selector, and the configured
 * pattern is run against one attribute of each located element. The first
 * successful match is appended to the group id, so pages that yield
 * different matches produce different ids.
 *
 * @example
 *   id: group-by-node-type
 *   type: element_filter
 *   options:
 *     selector: .node # DOM or Xpath
 *     pattern: /node-\w+/
 *     filter_attr: class
 */
class ElementFilter extends GroupBase
{

    /**
     * The filtered type - used to separate output by id.
     *
     * NULL until match() captures a value from a response.
     *
     * @var string|null
     */
    protected $filter_type;


    /**
     * {@inheritdoc}
     */
    public function __construct(array $config=[])
    {
        parent::__construct($config);
        $this->filter_type = NULL;

    }//end __construct()


    /**
     * {@inheritdoc}
     *
     * Appends "-<filter type>" to the parent id once match() has captured a
     * non-empty value; otherwise the parent id is returned unchanged.
     */
    public function getId() : string
    {
        $id = parent::getId();

        if ($this->filter_type) {
            $id .= "-{$this->filter_type}";
        }

        return $id;

    }//end getId()


    /**
     * {@inheritdoc}
     *
     * Returns FALSE when the 'selector' or 'pattern' option is empty, or when
     * the selector locates no elements. Returns TRUE when at least one element
     * is located, even if the pattern matches none of them; in that case the
     * filter type is cleared and getId() returns the parent id.
     *
     * @param string            $url      Used as the base URI when parsing the body.
     * @param ResponseInterface $response Response whose body is parsed.
     *
     * @return bool
     */
    public function match($url, ResponseInterface $response) : bool
    {
        $selector    = $this->getOption('selector');
        $pattern     = $this->getOption('pattern');
        $filter_attr = $this->getOption('filter_attr') ?: 'class';

        // Check configuration first so a misconfigured group never pays for
        // parsing the response body.
        if (empty($selector) || empty($pattern)) {
            return FALSE;
        }

        $dom = new Crawler($response->getBody()->__toString(), $url);

        // Try the selector as XPath first.
        try {
            $element = $dom->evaluate($selector);
        } catch (\Exception $error) {
            $element = NULL;
        }

        // Fall back to a CSS selector when XPath failed, returned something
        // that is not a node list, or located nothing.
        if (!($element instanceof Crawler) || $element->count() === 0) {
            try {
                $element = $dom->filter($selector);
            } catch (\Exception $error) {
                return FALSE;
            }
        }

        if ($element->count() === 0) {
            return FALSE;
        }

        $types = $element->each(
            function (Crawler $node) use ($filter_attr, $pattern) {
                // attr() returns NULL for a missing attribute; cast it so
                // preg_match() always receives a string subject.
                $value = (string) $node->attr($filter_attr);

                // Only a return of exactly 1 is a match: 0 means no match
                // and FALSE means the pattern itself failed.
                if (preg_match($pattern, $value, $matches) === 1) {
                    return $matches[0];
                }

                return NULL;
            }
        );

        // Use the first element that actually matched instead of whatever
        // the first element produced, so a leading non-matching element does
        // not hide a later match.
        $this->filter_type = NULL;
        foreach ($types as $type) {
            if ($type !== NULL) {
                $this->filter_type = $type;
                break;
            }
        }

        return TRUE;

    }//end match()


}//end class

0 comments on commit 4a329d7

Please sign in to comment.