Skip to content

Commit

Permalink
Merge branch 'release/0.0.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
shulard committed Mar 13, 2015
2 parents e3c760f + 623f5dc commit 223e3bd
Show file tree
Hide file tree
Showing 7 changed files with 284 additions and 27 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
bee4/robots.txt
bee4/robots.txt v0.0.0
======================

[![Build Status](https://travis-ci.org/bee4/robots.txt.svg?branch=develop)](https://travis-ci.org/bee4/robots.txt)
Expand Down
60 changes: 60 additions & 0 deletions src/Parser.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
<?php
/**
* This file is part of the beebot package.
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*
* @copyright Bee4 2015
* @author Stephane HULARD <[email protected]>
* @package Bee4\RobotsTxt
*/

namespace Bee4\RobotsTxt;

/**
 * Class Parser
 * Take the content of a robots.txt file and transform it to rules
 * @package Bee4\RobotsTxt
 */
class Parser
{
    /**
     * UTF-8 byte order mark that may prefix the file content
     */
    const UTF8_BOM = "\xEF\xBB\xBF";

    /**
     * Robots.txt file content, without its leading byte order mark
     * @var string
     */
    protected $content;

    /**
     * @param string $content Raw robots.txt content
     */
    public function __construct($content) {
        $content = (string)$content;
        // Remove the BOM only when it is a leading prefix. trim() with a
        // character list removes each of those bytes individually, at both
        // ends, and could cut a trailing multi-byte UTF-8 character in half.
        if( strncmp($content, self::UTF8_BOM, strlen(self::UTF8_BOM)) === 0 ) {
            $content = (string)substr($content, strlen(self::UTF8_BOM));
        }
        $this->content = $content;
    }

    /**
     * Read the content line by line and build one Rule per User-Agent line.
     *
     * Handled directives are User-Agent, Allow and Disallow (case insensitive).
     * Everything after a "#" is a comment and is dropped, surrounding
     * whitespace is ignored, and any other line is skipped.
     *
     * @return Rules
     * @throws \RuntimeException Raised by Rules::add() when the same
     *                           User-Agent is declared twice
     */
    public function parse() {
        $rules = new Rules();
        $ua = $rule = null;

        // Split locally instead of using strtok(), whose tokenizer state is
        // global and would be corrupted by any other strtok() call made while
        // the loop is running.
        $lines = preg_split('/[\r\n]+/', $this->content, -1, PREG_SPLIT_NO_EMPTY);
        foreach( $lines as $line ) {
            $comment = strpos($line, '#');
            if( $comment !== false ) {
                $line = substr($line, 0, $comment);
            }
            $line = trim($line);

            // Whitespace around the colon is optional
            if( !preg_match('/^(User-Agent|Allow|Disallow)\s*:\s*(.*)$/i', $line, $matches) ) {
                continue;
            }
            $directive = strtolower($matches[1]);
            $value = $matches[2];

            if( $directive === 'user-agent' ) {
                // A new group starts: store the previous one first
                if( $ua !== null ) {
                    $rules->add($ua, $rule);
                }
                $ua = $value;
                $rule = new Rule();
            } elseif( $rule !== null && $value !== '' ) {
                // Directives found before any User-Agent line have no group
                // to belong to, and an empty value carries no path to
                // restrict: both are skipped.
                if( $directive === 'allow' ) {
                    $rule->allow($value);
                } else {
                    $rule->disallow($value);
                }
            }
        }

        // Handle the last group, which no following User-Agent line flushed
        if( $rule instanceof Rule ) {
            $rules->add($ua, $rule);
        }

        return $rules;
    }
}
47 changes: 47 additions & 0 deletions src/ParserFactory.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
<?php
/**
* This file is part of the beebot package.
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*
* @copyright Bee4 2015
* @author Stephane HULARD <[email protected]>
* @package Bee4\RobotsTxt
*/

namespace Bee4\RobotsTxt;

/**
 * Class ParserFactory
 * Take a URL, try to load the robots.txt file and return the parsed rules
 * @package Bee4\RobotsTxt
 */
class ParserFactory
{
    /**
     * Build the rules from a URL or from raw robots.txt content.
     *
     * When $item is a valid URL, the robots.txt file of that website is
     * downloaded with cURL and parsed. Any other string is parsed directly
     * as robots.txt content.
     *
     * @param string $item Website URL (root or /robots.txt) or file content
     * @return Rules
     * @throws \InvalidArgumentException When the URL path points to something
     *                                   else than the root or /robots.txt
     * @throws \RuntimeException When the file can't be downloaded or the
     *                           response status is not 200
     */
    public static function build($item) {
        if( filter_var($item, FILTER_VALIDATE_URL) !== false ) {
            $parsed = parse_url($item);
            // "http://host" has no path and "http://host/" has the path "/":
            // both designate the website root and must be accepted.
            if( isset($parsed['path']) && !in_array($parsed['path'], ['/', '/robots.txt'], true) ) {
                throw new \InvalidArgumentException('The robots.txt file can\'t be found at: '.$item.' this file must be hosted at website root');
            }

            // Rebuild the URL from scheme, host and port only: credentials,
            // query string and fragment are intentionally left out.
            $url = $parsed['scheme'].'://'.$parsed['host']
                .(isset($parsed['port']) ? ':'.$parsed['port'] : '')
                .'/robots.txt';

            $handle = curl_init();
            curl_setopt($handle, CURLOPT_URL, $url);
            curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
            $item = curl_exec($handle);
            $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
            curl_close($handle);

            // curl_exec() returns false on transfer failure
            if( $item === false || $status !== 200 ) {
                throw new \RuntimeException('Can\'t access the robots.txt file at: '.$url);
            }
        }

        $parser = new Parser($item);
        return $parser->parse();
    }
}
56 changes: 36 additions & 20 deletions src/Rule.php
Original file line number Diff line number Diff line change
Expand Up @@ -19,28 +19,40 @@
class Rule
{
/**
* The UserAgent corresponding to the rule
* @var string
* The regex patterns that identify whether the rule matches or not!
* @var array
*/
protected $ua;
protected $patterns = [
'allow' => [],
'disallow' => []
];

/**
* The regex pattern that identifies whether the rule matches or not!
* @var string
* Add a pattern to match in the current rule by allowing
* @param string $pattern
* @return Rule
*/
protected $pattern;

public function __construct($ua) {
$this->ua = $ua;
$this->pattern = '';
public function allow($pattern) {
$this->patterns['allow'][$pattern] = $this->handlePattern($pattern);
return $this;
}

/**
* Add a pattern to match in the current rule
* Add a pattern to match in the current rule by disallowing
* @param string $pattern
* @return Rule
*/
public function addPattern($pattern) {
public function disallow($pattern) {
$this->patterns['disallow'][$pattern] = $this->handlePattern($pattern);
return $this;
}

/**
* Transform current pattern to be used for matching
* @param string $pattern
* @return string;
*/
private function handlePattern($pattern) {
$ended = substr($pattern, -1) === '$';
$pattern = rtrim($pattern, '*');
$pattern = rtrim($pattern, '$');
Expand All @@ -49,13 +61,7 @@ public function addPattern($pattern) {
array_walk($parts, function(&$part) {
$part = preg_quote($part, '/');
});

if( $this->pattern != '' ) {
$this->pattern .= '|';
}
$this->pattern .= implode('.*', $parts).($ended?'':'.*');

return $this;
return implode('.*', $parts).($ended?'':'.*');
}

/**
Expand All @@ -64,6 +70,16 @@ public function addPattern($pattern) {
* @return boolean
*/
public function match($url) {
return preg_match('/^'.$this->pattern.'$/i', $url) != false;
arsort($this->patterns['allow'], SORT_NUMERIC);
arsort($this->patterns['disallow'], SORT_NUMERIC);

if( count($this->patterns['disallow']) > 0 && preg_match('/^(?!('.implode('|', $this->patterns['disallow']).')).*$/i', $url ) == false ) {
if( count($this->patterns['allow']) > 0 ) {
return preg_match('/^('.implode('|', $this->patterns['allow']).')$/i', $url) != false;
} else {
return false;
}
}
return true;
}
}
82 changes: 82 additions & 0 deletions src/Rules.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
<?php
/**
* This file is part of the beebot package.
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*
* @copyright Bee4 2015
* @author Stephane HULARD <[email protected]>
* @package Bee4\RobotsTxt
*/

namespace Bee4\RobotsTxt;

/**
 * Class Rules
 * Represent a collection of Rules
 * @package Bee4\RobotsTxt
 */
class Rules
{
    const DEFAULT_UA = '*';

    /**
     * The collection of rules, indexed by the User-Agent regexp built by
     * handleUa() (or by DEFAULT_UA for the default group)
     * @var array
     */
    protected $collection = [];

    /**
     * Add a new rule to the collection
     * @param string $ua
     * @param Rule $rule
     * @return Rules
     * @throws \RuntimeException When a rule already exists for this UserAgent
     */
    public function add($ua, Rule $rule) {
        $ua = $this->handleUa($ua);
        if( isset($this->collection[$ua]) ) {
            throw new \RuntimeException('You can\'t add 2 rules for the same UserAgent');
        }
        $this->collection[$ua] = $rule;

        return $this;
    }

    /**
     * Check a URL against the rule that applies to the given User-Agent.
     *
     * The result is the one returned by Rule::match(). When no rule applies
     * to this User-Agent at all, nothing restricts it, so the URL is reported
     * with the same value Rule::match() gives for a rule without pattern: true.
     *
     * @param string $ua
     * @param string $url
     * @return boolean
     */
    public function match($ua, $url) {
        $rule = $this->get($ua);
        if( $rule === null ) {
            return true;
        }
        return $rule->match($url);
    }

    /**
     * Retrieve rules for a given UA.
     *
     * The first specific User-Agent (in insertion order) whose regexp matches
     * wins; the default "*" rule is only used as a fallback.
     *
     * @param string $ua
     * @return null|Rule
     */
    public function get($ua) {
        foreach( $this->collection as $pattern => $rule ) {
            if( $pattern !== self::DEFAULT_UA && preg_match($pattern, $ua) === 1 ) {
                return $rule;
            }
        }

        return isset($this->collection[self::DEFAULT_UA])
            ? $this->collection[self::DEFAULT_UA]
            : null;
    }

    /**
     * Update the UA to make a valid regexp (case insensitive prefix match)
     * @param string $ua
     * @return string
     */
    private function handleUa($ua) {
        if( $ua == self::DEFAULT_UA ) {
            return $ua;
        }
        // The delimiter must be given to preg_quote(), otherwise a "/" in the
        // User-Agent (ex: "Bot/1.0") ends the regexp too early and makes it
        // invalid.
        return '/^'.preg_quote($ua, '/').'.*/i';
    }
}
49 changes: 49 additions & 0 deletions test/units/ParserTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
<?php
/**
* This file is part of the beebot package.
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*
* @copyright Bee4 2015
* @author Stephane HULARD <[email protected]>
* @package Test\Bee4\RobotsTxt
*/

namespace Test\Bee4\RobotsTxt;

use Bee4\RobotsTxt\Parser;
use Bee4\RobotsTxt\ParserFactory;

/**
 * Unit tests covering Parser and ParserFactory
 * @package Test\Bee4\RobotsTxt
 */
class ParserTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Robots.txt fixture: a default group and a "google-bot" group
     * @var string
     */
    protected $content = "User-agent: *
Disallow: /mentions-legales/
User-agent: google-bot
Allow: /truite.php
disallow: /";

    /**
     * Parse the fixture and check the outcome for both groups
     */
    public function testParse() {
        $parser = new Parser($this->content);
        $parsed = $parser->parse();

        // Default group, checked directly on the Rule
        $default = $parsed->get('*');
        $this->assertInstanceOf('\Bee4\RobotsTxt\Rule', $default);

        $legalNotice = $default->match('/mentions-legales/');
        $this->assertFalse($legalNotice);
        $otherPage = $default->match('/another-page.html');
        $this->assertTrue($otherPage);

        // "google-bot" group, checked through the collection
        $userAgent = 'Google-Bot v01';
        $this->assertFalse($parsed->match($userAgent, '/toto'));
        $this->assertTrue($parsed->match($userAgent, '/truite.php'));
    }

    /**
     * Build the rules from a website URL and check that a default rule exists
     */
    public function testParserFactory() {
        $website = "http://www.bee4.fr";
        $default = ParserFactory::build($website)->get('*');

        $this->assertInstanceOf('\Bee4\RobotsTxt\Rule', $default);
    }
}
15 changes: 9 additions & 6 deletions test/units/RuleTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,18 @@
class RuleTest extends \PHPUnit_Framework_TestCase
{
public function testPatternSetAndMatch() {
$object = new Rule('Bot1');
$object->addPattern('/toto*.php$');
$object->addPattern('/truite');
$object->addPattern('/section*');
$object = new Rule();
$object->disallow('/section*');
$this->assertFalse($object->match('/section/tout/cetuqiadjoa.jpg'));

$this->assertTrue($object->match('/toto/tata.php'));
$object->allow('/toto*.php$');
$this->assertTrue($object->match('/toto/tata.PHP'));
$this->assertTrue($object->match('/toto/tata.php'));

$object->allow('/truite');
$this->assertTrue($object->match('/truite/et/tout/cetuqiadjoa.jpg'));
$this->assertTrue($object->match('/section/tout/cetuqiadjoa.jpg'));

$object->disallow('/');
$this->assertFalse($object->match('/to/tata.php'));
}
}

0 comments on commit 223e3bd

Please sign in to comment.