Skip to content

Commit

Permalink
Merge pull request #9 from Laurentvw/develop
Browse files Browse the repository at this point in the history
Easily get the URL of the crawled page in the match config
  • Loading branch information
Laurentvw committed Aug 27, 2015
2 parents 504c983 + c1a6bd2 commit 2ba0c4b
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 15 deletions.
12 changes: 7 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ Note that the kind of value passed to the "id" key may vary depending on what se

_**RegexSelector** uses <http://php.net/manual/en/function.preg-match-all.php> under the hood._

For your convenience, when using Regex, a match with `'id' => 0` will return the URL of the crawled page.

### Retrieving & Sorting

Once you've specified a selector using the **with** method, you can start retrieving and/or sorting the data.
Expand Down Expand Up @@ -188,18 +190,18 @@ $matches->filter(function($match) {

### Mutating

In order to handle inconsistencies or formatting issues, you can alter the matched values to a more desirable value. Altering happens before filtering and sorting the result set. You can do so by using the `apply` index in the match configuration array with a closure that takes 1 argument: the matched value.
In order to handle inconsistencies or formatting issues, you can alter the matched values to a more desirable value. Altering happens before filtering and sorting the result set. You can do so by using the `apply` index in the match configuration array with a closure that takes 2 arguments: the matched value and the URL of the crawled page.

```php
$matchConfig = array(
array(
'name' => 'url',
'id' => 1,
// Add domain to URL if it's not present already
'apply' => function($match) use($url)
// Add domain to relative URLs
'apply' => function($match, $sourceUrl)
{
if (!stristr($match, 'http')) {
return $url . trim($match, '/');
return $sourceUrl . trim($match, '/');
}
return $match;
},
Expand All @@ -218,7 +220,7 @@ $matchConfig = array(

### Validation

You may validate the matched data to ensure that the result set always contains the desired result. Validation happens after optionally mutating the data set with `apply`. To add the validation rules that should be applied to the data, use the `validate` index in the match configuration array with a closure that takes one argument: the matched value. The closure should return `true` if the validation succeeded, and `false` if the validation failed. Matches that fail the validation will be removed from the result set.
You may validate the matched data to ensure that the result set always contains the desired result. Validation happens after optionally mutating the data set with `apply`. To add the validation rules that should be applied to the data, use the `validate` index in the match configuration array with a closure that takes 2 arguments: the matched value and the URL of the crawled page. The closure should return `true` if the validation succeeded, and `false` if the validation failed. Matches that fail the validation will be removed from the result set.

```php
$matchConfig = array(
Expand Down
8 changes: 5 additions & 3 deletions src/Laurentvw/Scrapher/Matcher.php
Original file line number Diff line number Diff line change
Expand Up @@ -101,13 +101,15 @@ public function addLog($msg)
/**
* @param $content
*
* @param $sourceKey
* @return array
*/
public function getMatches($content)
public function getMatches($content, $sourceKey)
{
$filteredResults = array();

$this->getSelector()->setContent($content);
$this->getSelector()->setSourceKey($sourceKey);

$matches = $this->getSelector()->getMatches();

Expand Down Expand Up @@ -140,14 +142,14 @@ private function fetch(array $matchLine)
foreach ($this->getSelector()->getConfig() as $match) {
// Get the match value, optionally apply a function to it
if (isset($match['apply'])) {
$result[$match['name']] = $match['apply']($matchLine[$match['name']]);
$result[$match['name']] = $match['apply']($matchLine[$match['name']], $this->getSelector()->getSourceKey());
} else {
$result[$match['name']] = $matchLine[$match['name']];
}

// Validate this match
if (isset($match['validate'])) {
if (!$match['validate']($result[$match['name']])) {
if (!$match['validate']($result[$match['name']], $this->getSelector()->getSourceKey())) {
$this->addLog('Skipping match because validation failed for '.$match['name'].': '.$result[$match['name']]);

return false;
Expand Down
16 changes: 10 additions & 6 deletions src/Laurentvw/Scrapher/Scrapher.php
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ public function __construct($contents = null)
public function addUrl($url)
{
$page = new Page($url);
$this->addContent($page->getHTML());
$this->addContent($page->getHTML(), $url);

return $this;
}
Expand All @@ -109,12 +109,16 @@ public function addUrls(array $urls)
* Add content to scrape.
*
* @param string $content
*
* @param null $key
* @return Scrapher
*/
public function addContent($content)
public function addContent($content, $key = null)
{
$this->contents[] = $content;
if (!is_null($key)) {
$this->contents[$key] = $content;
} else {
$this->contents[] = $content;
}

return $this;
}
Expand Down Expand Up @@ -305,8 +309,8 @@ protected function scrape()

$results = array();

foreach ($this->contents as $content) {
$results = array_merge($results, $this->getMatcher()->getMatches($content));
foreach ($this->contents as $id => $content) {
$results = array_merge($results, $this->getMatcher()->getMatches($content, $id));
}

if ($results) {
Expand Down
4 changes: 4 additions & 0 deletions src/Laurentvw/Scrapher/Selectors/RegexSelector.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ public function getMatches()

foreach ($matchLines as $i => $matchLine) {
foreach ($this->getConfig() as $config) {
if ($config['id'] == 0) {
$matches[$i][$config['name']] = $this->getSourceKey();
continue;
}
if (!isset($matchLine[$config['id']])) {
throw new MatchIdNotFoundException($config['id']);
}
Expand Down
12 changes: 11 additions & 1 deletion src/Laurentvw/Scrapher/Selectors/Selector.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

abstract class Selector
{
private $content, $expression, $config;
private $content, $sourceKey, $expression, $config;

public function __construct($expression, $config)
{
Expand All @@ -22,6 +22,16 @@ public function getContent()
return $this->content;
}

/**
 * Store the key identifying where the current content came from.
 *
 * Matcher::getMatches() passes through the array key under which the
 * content was registered in Scrapher: the URL when it was added via
 * addUrl(), otherwise the automatically assigned numeric index.
 *
 * @param mixed $key Key of the content currently being scraped.
 */
public function setSourceKey($key)
{
$this->sourceKey = $key;
}

/**
 * Return the key last stored with setSourceKey().
 *
 * @return mixed The stored source key; null if none has been set yet.
 */
public function getSourceKey()
{
return $this->sourceKey;
}

public function setConfig($config)
{
$this->config = $config;
Expand Down

0 comments on commit 2ba0c4b

Please sign in to comment.