From ef3a2c981198685b78e67b9783ffdb2c717f1560 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ste=CC=81phane=20HULARD?= Date: Fri, 13 Mar 2015 12:53:24 +0100 Subject: [PATCH 1/5] Update rules tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane HULARD --- src/Rule.php | 56 ++++++++++++++++++++++++++--------------- test/units/RuleTest.php | 15 ++++++----- 2 files changed, 45 insertions(+), 26 deletions(-) diff --git a/src/Rule.php b/src/Rule.php index 78f05c7..e08ce33 100644 --- a/src/Rule.php +++ b/src/Rule.php @@ -19,28 +19,40 @@ class Rule { /** - * The UserAgent corresponding to the rule - * @var string + * The regex patterns that identidy if the rule match or not! + * @var array */ - protected $ua; + protected $patterns = [ + 'allow' => [], + 'disallow' => [] + ]; /** - * The regex pattern that identidy if the rule match or not! - * @var string + * Add a pattern to match in the current rule by allowing + * @param string $pattern + * @return Rule */ - protected $pattern; - - public function __construct($ua) { - $this->ua = $ua; - $this->pattern = ''; + public function allow($pattern) { + $this->patterns['allow'][$pattern] = $this->handlePattern($pattern); + return $this; } /** - * Add a pattern to match in the current rule + * Add a pattern to match in the current rule by disallowing * @param string $pattern * @return Rule */ - public function addPattern($pattern) { + public function disallow($pattern) { + $this->patterns['disallow'][$pattern] = $this->handlePattern($pattern); + return $this; + } + + /** + * Transform current pattern to be used for matching + * @param string $pattern + * @return string; + */ + private function handlePattern($pattern) { $ended = substr($pattern, -1) === '$'; $pattern = rtrim($pattern, '*'); $pattern = rtrim($pattern, '$'); @@ -49,13 +61,7 @@ public function addPattern($pattern) { array_walk($parts, function(&$part) { $part = preg_quote($part, '/'); }); - - if( $this->pattern != '' ) { - $this->pattern .= '|'; - } - $this->pattern .= implode('.*', $parts).($ended?'':'.*'); - - return $this; + return implode('.*', $parts).($ended?'':'.*'); } /** @@ -64,6 +70,16 @@ public function addPattern($pattern) { * @return boolean */ public function match($url) { - return preg_match('/^'.$this->pattern.'$/i', $url) != false; + arsort($this->patterns['allow'], SORT_NUMERIC); + arsort($this->patterns['disallow'], SORT_NUMERIC); + + if( count($this->patterns['disallow']) > 0 && preg_match('/^(?!('.implode('|', $this->patterns['disallow']).')).*$/i', $url ) == false ) { + if( count($this->patterns['allow']) > 0 ) { + return preg_match('/^('.implode('|', $this->patterns['allow']).')$/i', $url) != false; + } else { + return false; + } + } + return true; } } diff --git a/test/units/RuleTest.php b/test/units/RuleTest.php index 9991b3f..4e65008 100644 --- a/test/units/RuleTest.php +++ b/test/units/RuleTest.php @@ -20,15 +20,18 @@ class RuleTest extends \PHPUnit_Framework_TestCase { public function testPatternSetAndMatch() { - $object = new Rule('Bot1'); - $object->addPattern('/toto*.php$'); - $object->addPattern('/truite'); - $object->addPattern('/section*'); + $object = new Rule(); + $object->disallow('/section*'); + $this->assertFalse($object->match('/section/tout/cetuqiadjoa.jpg')); - $this->assertTrue($object->match('/toto/tata.php')); + $object->allow('/toto*.php$'); $this->assertTrue($object->match('/toto/tata.PHP')); + $this->assertTrue($object->match('/toto/tata.php')); + + $object->allow('/truite'); $this->assertTrue($object->match('/truite/et/tout/cetuqiadjoa.jpg')); - $this->assertTrue($object->match('/section/tout/cetuqiadjoa.jpg')); + + $object->disallow('/'); $this->assertFalse($object->match('/to/tata.php')); } } From b8a350642e8c874985c7d27be63517f1a0c8a970 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ste=CC=81phane=20HULARD?= Date: Fri, 13 Mar 2015 12:53:30 +0100 Subject: [PATCH 2/5] Add a parser Object + rules collection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane HULARD --- src/Parser.php | 61 +++++++++++++++++++++++++++++++++++++ src/Rules.php | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 143 insertions(+) create mode 100644 src/Parser.php create mode 100644 src/Rules.php diff --git a/src/Parser.php b/src/Parser.php new file mode 100644 index 0000000..f8fdc86 --- /dev/null +++ b/src/Parser.php @@ -0,0 +1,61 @@ + + * @package Bee4\RobotsTxt + */ + +namespace Bee4\RobotsTxt; + +/** + * Class Parser + * Take the content of a robots.txt file and transform it to rules + * @package Bee4\RobotsTxt + */ +class Parser +{ + /** + * Robots.txt file content + * @var string + */ + protected $content; + + public function __construct($content) { + $this->content = trim($content, "\xEF\xBB\xBF"); + } + + public function parse() { + $rules = new Rules(); + + $ua = $rule = null; + $separator = "\r\n"; + $line = strtok($this->content, $separator); + while ($line !== false) { + if( strpos($line, '#') !== 0 ) { + if( preg_match('/^User-Agent\: (.*)$/i', $line, $matches)) { + if( $ua !== null ) { + $rules->add($ua, $rule); + } + $ua = $matches[1]; + $rule = new Rule(); + } elseif( preg_match('/^Allow: (.*)$/i', $line, $matches)) { + $rule->allow($matches[1]); + } elseif( preg_match('/^Disallow: (.*)$/i', $line, $matches)) { + $rule->disallow($matches[1]); + } + } + + $line = strtok( $separator ); + } + //Handle the last item in the loop + if( $rule instanceof Rule ) { + $rules->add($ua, $rule); + } + + return $rules; + } +} diff --git a/src/Rules.php b/src/Rules.php new file mode 100644 index 0000000..05214dd --- /dev/null +++ b/src/Rules.php @@ -0,0 +1,82 @@ + + * @package Bee4\RobotsTxt + */ + +namespace Bee4\RobotsTxt; + +/** + * Class Rules + * Represent a collection of Rules + * @package Bee4\RobotsTxt + */ +class Rules +{ + const DEFAULT_UA = '*'; + + /** + * The collection of rules + * @var array + */ + protected $collection = []; + + /** + * Add a new rule to the collection + * @param string $ua + * @param Rule $rule + * @return Rules + */ + public function add($ua, Rule $rule) { + $ua = $this->handleUa($ua); + if( isset($this->collection[$ua]) ) { + throw new \RuntimeException('You can\'t add 2 rules for the same UserAgent'); + } + $this->collection[$ua] = $rule; + + return $this; + } + + public function match( $ua, $url ) { + if( ($rule = $this->get($ua)) === null ) { + return false; + } + return $rule->match($url); + } + + /** + * Retrieve rules for a given UA + * @param string $ua + * @return null|Rule + */ + public function get($ua) { + $item = null; + $it = new \ArrayIterator($this->collection); + iterator_apply($it, function($it, $ua) use (&$item) { + if( $it->key() != Rules::DEFAULT_UA && preg_match($it->key(), $ua) != false ) { + $item = $it->current(); + return false; + } + return true; + }, [$it, $ua]); + + return $item!==null?$item:(isset($this->collection[self::DEFAULT_UA])?$this->collection[self::DEFAULT_UA]:null); + } + + /** + * Update the UA to make a valid regexp + * @param string $ua + * @return string + */ + private function handleUa($ua) { + if( $ua == self::DEFAULT_UA ) { + return $ua; + } + return '/^'.preg_quote($ua).'.*/i'; + } +} \ No newline at end of file From 228141fea588ec97353839fc0d0e6d7114a22c5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ste=CC=81phane=20HULARD?= Date: Fri, 13 Mar 2015 12:54:10 +0100 Subject: [PATCH 3/5] Add tests on Parser + Rules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane HULARD --- test/units/ParserTest.php | 42 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 test/units/ParserTest.php diff --git a/test/units/ParserTest.php b/test/units/ParserTest.php new file mode 100644 index 0000000..15935dc --- /dev/null +++ b/test/units/ParserTest.php @@ -0,0 +1,42 @@ + + * @package Test\Bee4\RobotsTxt + */ + +namespace Test\Bee4\RobotsTxt; + +use Bee4\RobotsTxt\Parser; + +/** + * Parser unit test + * @package Test\Bee4\RobotsTxt + */ +class ParserTest extends \PHPUnit_Framework_TestCase +{ + protected $content = "User-agent: * +Disallow: /mentions-legales/ + +User-agent: google-bot +Allow: /truite.php +disallow: /"; + + public function testParse() { + $object = new Parser($this->content); + $rules = $object->parse(); + + $rule = $rules->get('*'); + $this->assertInstanceOf('\Bee4\RobotsTxt\Rule', $rule); + + $this->assertFalse($rule->match('/mentions-legales/')); + $this->assertTrue($rule->match('/another-page.html')); + + $this->assertFalse($rules->match('Google-Bot v01', '/toto')); + $this->assertTrue($rules->match('Google-Bot v01', '/truite.php')); + } +} From b0100ac9ed88c9b6873d9566338195c2e74ad7fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ste=CC=81phane=20HULARD?= Date: Fri, 13 Mar 2015 14:59:35 +0100 Subject: [PATCH 4/5] Add a ParserFactory which build a parser object from an URL or a file content MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane HULARD --- src/Parser.php | 1 - src/ParserFactory.php | 47 +++++++++++++++++++++++++++++++++++++++ test/units/ParserTest.php | 7 ++++++ 3 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 src/ParserFactory.php diff --git a/src/Parser.php b/src/Parser.php index f8fdc86..d9b1f11 100644 --- a/src/Parser.php +++ b/src/Parser.php @@ -30,7 +30,6 @@ public function __construct($content) { public function parse() { $rules = new Rules(); - $ua = $rule = null; $separator = "\r\n"; $line = strtok($this->content, $separator); diff --git a/src/ParserFactory.php b/src/ParserFactory.php new file mode 100644 index 0000000..0d300eb --- /dev/null +++ b/src/ParserFactory.php @@ -0,0 +1,47 @@ + + * @package Bee4\RobotsTxt + */ + +namespace Bee4\RobotsTxt; + +/** + * Class ParserFactory + * Take an URL, try to load the robots.txt file and return the parsed rules + * @package Bee4\RobotsTxt + */ +class ParserFactory +{ + public static function build($item) { + if( filter_var($item, FILTER_VALIDATE_URL)!==false ) { + $parsed = parse_url($item); + if( isset($parsed['path']) && $parsed['path'] != '/robots.txt' ) { + throw new \InvalidArgumentException('The robots.txt file can\'t be found at: '.$item.' this file must be hosted at website root'); + } + + $parsed['path'] = '/robots.txt'; + $parsed = array_intersect_key($parsed, array_flip(['scheme', 'host', 'port', 'path'])); + $url = $parsed['scheme'].'://'.$parsed['host'].(isset($parsed['port'])?':'.$parsed['port']:'').$parsed['path']; + + $handle = curl_init(); + curl_setopt($handle, CURLOPT_URL, $url); + curl_setopt($handle, CURLOPT_RETURNTRANSFER, true); + $item = curl_exec($handle); + $status = curl_getinfo($handle, CURLINFO_HTTP_CODE); + curl_close($handle); + + if( $status !== 200 ) { + throw new \RuntimeException('Can\'t access the robots.txt file at: '.$url); + } + } + + $parser = new Parser($item); + return $parser->parse(); + } +} diff --git a/test/units/ParserTest.php b/test/units/ParserTest.php index 15935dc..02b3816 100644 --- a/test/units/ParserTest.php +++ b/test/units/ParserTest.php @@ -12,6 +12,7 @@ namespace Test\Bee4\RobotsTxt; use Bee4\RobotsTxt\Parser; +use Bee4\RobotsTxt\ParserFactory; /** * Parser unit test @@ -39,4 +40,10 @@ public function testParse() { $this->assertFalse($rules->match('Google-Bot v01', '/toto')); $this->assertTrue($rules->match('Google-Bot v01', '/truite.php')); } + + public function testParserFactory() { + $rules = ParserFactory::build("http://www.bee4.fr"); + + $this->assertInstanceOf('\Bee4\RobotsTxt\Rule', $rules->get('*')); + } } From 623f5dc94bd7ca8ffeb3df8b4e0d27bc3249bdb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ste=CC=81phane=20HULARD?= Date: Fri, 13 Mar 2015 15:01:34 +0100 Subject: [PATCH 5/5] Bump version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stéphane HULARD --- README.md | 2 +- src/Rules.php | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 80eef6f..7862e89 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -bee4/robots.txt +bee4/robots.txt v0.0.0 ====================== [![Build Status](https://travis-ci.org/bee4/robots.txt.svg?branch=develop)](https://travis-ci.org/bee4/robots.txt) diff --git a/src/Rules.php b/src/Rules.php index 05214dd..ac3765a 100644 --- a/src/Rules.php +++ b/src/Rules.php @@ -42,7 +42,7 @@ public function add($ua, Rule $rule) { return $this; } - public function match( $ua, $url ) { + public function match($ua, $url) { if( ($rule = $this->get($ua)) === null ) { return false; }