-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
284 additions
and
27 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
<?php | ||
/** | ||
* This file is part of the beebot package. | ||
* For the full copyright and license information, please view the LICENSE | ||
* file that was distributed with this source code. | ||
* | ||
* @copyright Bee4 2015 | ||
* @author Stephane HULARD <[email protected]> | ||
* @package Bee4\RobotsTxt | ||
*/ | ||
|
||
namespace Bee4\RobotsTxt; | ||
|
||
/**
 * Class Parser
 * Take the content of a robots.txt file and transform it to rules
 *
 * Lines are read one by one; only the User-agent, Allow and Disallow
 * directives are interpreted, every other line is ignored.
 * @package Bee4\RobotsTxt
 */
class Parser
{
    /**
     * Robots.txt file content
     * @var string
     */
    protected $content;

    /**
     * @param string $content Raw robots.txt content, with or without a leading UTF-8 BOM
     */
    public function __construct($content) {
        $content = (string)$content;
        $bom = "\xEF\xBB\xBF";
        // Remove the BOM only when it is really the 3-byte prefix: trim() with a
        // character list would also eat matching bytes at the end of the content
        // and break trailing multi-byte UTF-8 characters.
        if( strncmp($content, $bom, 3) === 0 ) {
            $content = (string)substr($content, 3);
        }
        $this->content = $content;
    }

    /**
     * Transform the content into a Rules collection
     *
     * Consecutive User-agent lines form one group sharing the same Rule.
     * Allow/Disallow lines found before any User-agent line are ignored, as
     * are directives with an empty value.
     * @return Rules
     */
    public function parse() {
        $rules = new Rules();
        $agents = [];
        $rule = null;
        // True while the previous directive was a User-agent line
        $inAgentRun = false;

        foreach( preg_split('/\r\n|\r|\n/', $this->content) as $line ) {
            // Everything after '#' is a comment, wherever it starts on the line
            $hash = strpos($line, '#');
            if( $hash !== false ) {
                $line = (string)substr($line, 0, $hash);
            }
            $line = trim($line);
            if( $line === '' ) {
                continue;
            }
            // Whitespace around the colon is optional
            if( !preg_match('/^(user-agent|allow|disallow)\s*:\s*(.*)$/i', $line, $matches) ) {
                continue;
            }
            $directive = strtolower($matches[1]);
            $value = $matches[2];

            if( $directive === 'user-agent' ) {
                if( !$inAgentRun ) {
                    // A new group starts: store the previous one
                    foreach( $agents as $agent ) {
                        $rules->add($agent, $rule);
                    }
                    $agents = [];
                    $rule = new Rule();
                    $inAgentRun = true;
                }
                // Skip empty names and names repeated inside the same group
                if( $value !== '' && !in_array($value, $agents, true) ) {
                    $agents[] = $value;
                }
                continue;
            }

            $inAgentRun = false;
            // No group opened yet, or nothing to allow/disallow
            if( $rule === null || $value === '' ) {
                continue;
            }
            if( $directive === 'allow' ) {
                $rule->allow($value);
            } else {
                $rule->disallow($value);
            }
        }

        // Handle the last group of the file
        foreach( $agents as $agent ) {
            $rules->add($agent, $rule);
        }

        return $rules;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
<?php | ||
/** | ||
* This file is part of the beebot package. | ||
* For the full copyright and license information, please view the LICENSE | ||
* file that was distributed with this source code. | ||
* | ||
* @copyright Bee4 2015 | ||
* @author Stephane HULARD <[email protected]> | ||
* @package Bee4\RobotsTxt | ||
*/ | ||
|
||
namespace Bee4\RobotsTxt; | ||
|
||
/**
 * Class ParserFactory
 * Take an URL, try to load the robots.txt file and return the parsed rules
 *
 * A value that is not a valid URL is handed to the Parser as raw content.
 * @package Bee4\RobotsTxt
 */
class ParserFactory
{
    /**
     * Build the rules from an URL or from a robots.txt content
     * @param string $item Website URL (root or /robots.txt) or raw robots.txt content
     * @return Rules
     * @throws \InvalidArgumentException When the URL targets another path than the root robots.txt
     * @throws \RuntimeException When the file can't be downloaded
     */
    public static function build($item) {
        if( filter_var($item, FILTER_VALIDATE_URL) !== false ) {
            $item = self::download(self::robotsUrl($item));
        }

        $parser = new Parser($item);
        return $parser->parse();
    }

    /**
     * Compute the robots.txt URL of the website targeted by the given URL
     * Only scheme, host and port are kept; the path is forced to /robots.txt
     * @param string $url
     * @return string
     * @throws \InvalidArgumentException
     */
    private static function robotsUrl($url) {
        $parsed = parse_url($url);
        $path = isset($parsed['path']) ? $parsed['path'] : '';
        // No path and "/" both designate the website root
        if( $path !== '' && $path !== '/' && $path !== '/robots.txt' ) {
            throw new \InvalidArgumentException('The robots.txt file can\'t be found at: '.$url.' this file must be hosted at website root');
        }
        if( !isset($parsed['scheme']) || !isset($parsed['host']) ) {
            throw new \InvalidArgumentException('The robots.txt file can\'t be found at: '.$url.' the URL must contain a scheme and a host');
        }

        return $parsed['scheme'].'://'.$parsed['host']
            .(isset($parsed['port']) ? ':'.$parsed['port'] : '')
            .'/robots.txt';
    }

    /**
     * Retrieve the body of the given URL with cURL
     * @param string $url
     * @return string
     * @throws \RuntimeException When the transfer fails or the status is not 200
     */
    private static function download($url) {
        $handle = curl_init();
        curl_setopt($handle, CURLOPT_URL, $url);
        curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
        $body = curl_exec($handle);
        // Must be read before the handle is closed
        $error = curl_error($handle);
        $status = curl_getinfo($handle, CURLINFO_HTTP_CODE);
        curl_close($handle);

        // curl_exec gives false on failure, even if a 200 status line was already
        // received; such a result must not be parsed as an empty file
        if( $body === false || $status !== 200 ) {
            $detail = $error !== '' ? $error : 'HTTP status '.$status;
            throw new \RuntimeException('Can\'t access the robots.txt file at: '.$url.' ('.$detail.')');
        }

        return $body;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
<?php | ||
/** | ||
* This file is part of the beebot package. | ||
* For the full copyright and license information, please view the LICENSE | ||
* file that was distributed with this source code. | ||
* | ||
* @copyright Bee4 2015 | ||
* @author Stephane HULARD <[email protected]> | ||
* @package Bee4\RobotsTxt | ||
*/ | ||
|
||
namespace Bee4\RobotsTxt; | ||
|
||
/**
 * Class Rules
 * Represent a collection of Rules
 *
 * Each Rule is stored under a key derived from its user agent: either the
 * default agent as is, or a case insensitive prefix regexp.
 * @package Bee4\RobotsTxt
 */
class Rules
{
    const DEFAULT_UA = '*';

    /**
     * The collection of rules, indexed by DEFAULT_UA or by user agent regexp
     * @var array
     */
    protected $collection = [];

    /**
     * Add a new rule to the collection
     * @param string $ua
     * @param Rule $rule
     * @return Rules
     * @throws \RuntimeException When a rule is already registered for the same user agent
     */
    public function add($ua, Rule $rule) {
        $ua = $this->handleUa($ua);
        if( isset($this->collection[$ua]) ) {
            throw new \RuntimeException('You can\'t add 2 rules for the same UserAgent');
        }
        $this->collection[$ua] = $rule;

        return $this;
    }

    /**
     * Apply the rule registered for the given UA to the given URL
     * @param string $ua
     * @param string $url
     * @return mixed false when no rule applies to the UA, else the result of Rule::match
     */
    public function match($ua, $url) {
        $rule = $this->get($ua);
        if( $rule === null ) {
            return false;
        }
        return $rule->match($url);
    }

    /**
     * Retrieve rules for a given UA
     * The first specific rule, in insertion order, whose regexp matches wins;
     * the default rule is only used when no specific one matches
     * @param string $ua
     * @return null|Rule
     */
    public function get($ua) {
        foreach( $this->collection as $pattern => $rule ) {
            $pattern = (string)$pattern;
            if( $pattern !== self::DEFAULT_UA && preg_match($pattern, $ua) === 1 ) {
                return $rule;
            }
        }

        return isset($this->collection[self::DEFAULT_UA])
            ? $this->collection[self::DEFAULT_UA]
            : null;
    }

    /**
     * Update the UA to make a valid regexp
     * @param string $ua
     * @return string
     */
    private function handleUa($ua) {
        $ua = trim((string)$ua);
        if( $ua === self::DEFAULT_UA ) {
            return self::DEFAULT_UA;
        }
        // The delimiter must be given to preg_quote, otherwise a "/" in the
        // name (ex: "Mozilla/5.0") closes the pattern too early
        return '/^'.preg_quote($ua, '/').'.*/i';
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
<?php | ||
/** | ||
* This file is part of the beebot package. | ||
* For the full copyright and license information, please view the LICENSE | ||
* file that was distributed with this source code. | ||
* | ||
* @copyright Bee4 2015 | ||
* @author Stephane HULARD <[email protected]> | ||
* @package Test\Bee4\RobotsTxt | ||
*/ | ||
|
||
namespace Test\Bee4\RobotsTxt; | ||
|
||
use Bee4\RobotsTxt\Parser; | ||
use Bee4\RobotsTxt\ParserFactory; | ||
|
||
/**
 * Parser unit test
 * Cover the Parser on an inline robots.txt content and the ParserFactory
 * both offline (raw content, invalid URL) and against a live website
 * @package Test\Bee4\RobotsTxt
 */
class ParserTest extends \PHPUnit_Framework_TestCase
{
    /**
     * Robots.txt sample: one default group and one google-bot group
     * @var string
     */
    protected $content = "User-agent: *
Disallow: /mentions-legales/
User-agent: google-bot
Allow: /truite.php
disallow: /";

    /**
     * Parse the sample and check both groups
     */
    public function testParse() {
        $object = new Parser($this->content);
        $rules = $object->parse();

        $rule = $rules->get('*');
        $this->assertInstanceOf('\Bee4\RobotsTxt\Rule', $rule);

        $this->assertFalse($rule->match('/mentions-legales/'));
        $this->assertTrue($rule->match('/another-page.html'));

        $this->assertFalse($rules->match('Google-Bot v01', '/toto'));
        $this->assertTrue($rules->match('Google-Bot v01', '/truite.php'));
    }

    /**
     * The factory must parse directly a value which is not an URL
     * (no network access needed)
     */
    public function testParserFactoryWithRawContent() {
        $rules = ParserFactory::build($this->content);

        $this->assertInstanceOf('\Bee4\RobotsTxt\Rules', $rules);
        $this->assertInstanceOf('\Bee4\RobotsTxt\Rule', $rules->get('*'));
        $this->assertFalse($rules->match('Google-Bot v01', '/toto'));
    }

    /**
     * An URL targeting something else than the root robots.txt is refused
     * before any download is attempted
     * @expectedException \InvalidArgumentException
     */
    public function testParserFactoryRejectsNonRootPath() {
        ParserFactory::build("http://www.bee4.fr/folder/robots.txt");
    }

    /**
     * Download and parse a real robots.txt file
     * Depends on a live website: can be left out with --exclude-group network
     * @group network
     */
    public function testParserFactory() {
        $rules = ParserFactory::build("http://www.bee4.fr");

        $this->assertInstanceOf('\Bee4\RobotsTxt\Rule', $rules->get('*'));
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters