diff --git a/README.md b/README.md index 566ed7d..23413d7 100644 --- a/README.md +++ b/README.md @@ -128,6 +128,8 @@ Note that the kind of value passed to the "id" key may vary depending on what se _**RegexSelector** uses under the hood._ +For your convenience, when using Regex, a match with `'id' => 0` will return the URL of the crawled page. + ### Retrieving & Sorting Once you've specified a selector using the **with** method, you can start retrieving and/or sorting the data. @@ -188,18 +190,18 @@ $matches->filter(function($match) { ### Mutating -In order to handle inconsistencies or formatting issues, you can alter the matched values to a more desirable value. Altering happens before filtering and sorting the result set. You can do so by using the `apply` index in the match configuration array with a closure that takes 1 argument: the matched value. +In order to handle inconsistencies or formatting issues, you can alter the matched values to a more desirable value. Altering happens before filtering and sorting the result set. You can do so by using the `apply` index in the match configuration array with a closure that takes 2 arguments: the matched value and the URL of the crawled page. ```php $matchConfig = array( array( 'name' => 'url', 'id' => 1, - // Add domain to URL if it's not present already - 'apply' => function($match) use($url) + // Add domain to relative URLs + 'apply' => function($match, $sourceUrl) { if (!stristr($match, 'http')) { - return $url . trim($match, '/'); + return $sourceUrl . trim($match, '/'); } return $match; }, @@ -218,7 +220,7 @@ $matchConfig = array( ### Validation -You may validate the matched data to insure that the result set always contains the desired result. Validation happens after optionally mutating the data set with `apply`. To add the validation rules that should be applied to the data, use the `validate` index in the match configuration array with a closure that takes one argument: the matched value. 
The closure should return `true` if the validation succeeded, and `false` if the validation failed. Matches that fail the validation will be removed from the result set. +You may validate the matched data to ensure that the result set always contains the desired result. Validation happens after optionally mutating the data set with `apply`. To add the validation rules that should be applied to the data, use the `validate` index in the match configuration array with a closure that takes 2 arguments: the matched value and the URL of the crawled page. The closure should return `true` if the validation succeeded, and `false` if the validation failed. Matches that fail the validation will be removed from the result set. ```php $matchConfig = array( diff --git a/src/Laurentvw/Scrapher/Matcher.php b/src/Laurentvw/Scrapher/Matcher.php index 8931eef..989f663 100644 --- a/src/Laurentvw/Scrapher/Matcher.php +++ b/src/Laurentvw/Scrapher/Matcher.php @@ -101,13 +101,15 @@ public function addLog($msg) /** * @param $content * + * @param $sourceKey * @return array */ - public function getMatches($content) + public function getMatches($content, $sourceKey) { $filteredResults = array(); $this->getSelector()->setContent($content); + $this->getSelector()->setSourceKey($sourceKey); $matches = $this->getSelector()->getMatches(); @@ -140,14 +142,14 @@ private function fetch(array $matchLine) foreach ($this->getSelector()->getConfig() as $match) { // Get the match value, optionally apply a function to it if (isset($match['apply'])) { - $result[$match['name']] = $match['apply']($matchLine[$match['name']]); + $result[$match['name']] = $match['apply']($matchLine[$match['name']], $this->getSelector()->getSourceKey()); } else { $result[$match['name']] = $matchLine[$match['name']]; } // Validate this match if (isset($match['validate'])) { - if (!$match['validate']($result[$match['name']])) { + if (!$match['validate']($result[$match['name']], $this->getSelector()->getSourceKey())) {
$this->addLog('Skipping match because validation failed for '.$match['name'].': '.$result[$match['name']]); return false; diff --git a/src/Laurentvw/Scrapher/Scrapher.php b/src/Laurentvw/Scrapher/Scrapher.php index d25b90c..43c13ee 100644 --- a/src/Laurentvw/Scrapher/Scrapher.php +++ b/src/Laurentvw/Scrapher/Scrapher.php @@ -84,7 +84,7 @@ public function __construct($contents = null) public function addUrl($url) { $page = new Page($url); - $this->addContent($page->getHTML()); + $this->addContent($page->getHTML(), $url); return $this; } @@ -109,12 +109,16 @@ public function addUrls(array $urls) * Add content to scrape. * * @param string $content - * + * @param string|null $key * @return Scrapher */ - public function addContent($content) + public function addContent($content, $key = null) { - $this->contents[] = $content; + if (!is_null($key)) { + $this->contents[$key] = $content; + } else { + $this->contents[] = $content; + } return $this; } @@ -305,8 +309,8 @@ protected function scrape() $results = array(); - foreach ($this->contents as $content) { - $results = array_merge($results, $this->getMatcher()->getMatches($content)); + foreach ($this->contents as $id => $content) { + $results = array_merge($results, $this->getMatcher()->getMatches($content, $id)); } if ($results) { diff --git a/src/Laurentvw/Scrapher/Selectors/RegexSelector.php b/src/Laurentvw/Scrapher/Selectors/RegexSelector.php index 1bb93a7..58f9c0a 100644 --- a/src/Laurentvw/Scrapher/Selectors/RegexSelector.php +++ b/src/Laurentvw/Scrapher/Selectors/RegexSelector.php @@ -14,6 +14,10 @@ public function getMatches() foreach ($matchLines as $i => $matchLine) { foreach ($this->getConfig() as $config) { + if ($config['id'] === 0 || $config['id'] === '0') { /* id 0 is reserved for the source key; strict checks stop non-numeric ids from loosely equalling 0 */ + $matches[$i][$config['name']] = $this->getSourceKey(); + continue; + } if (!isset($matchLine[$config['id']])) { throw new MatchIdNotFoundException($config['id']); } diff --git a/src/Laurentvw/Scrapher/Selectors/Selector.php b/src/Laurentvw/Scrapher/Selectors/Selector.php index
342c0a4..ac8e987 100644 --- a/src/Laurentvw/Scrapher/Selectors/Selector.php +++ b/src/Laurentvw/Scrapher/Selectors/Selector.php @@ -4,7 +4,7 @@ abstract class Selector { - private $content, $expression, $config; + private $content, $sourceKey, $expression, $config; public function __construct($expression, $config) { @@ -22,6 +22,16 @@ public function getContent() return $this->content; } + public function setSourceKey($key) + { + $this->sourceKey = $key; + } + + public function getSourceKey() + { + return $this->sourceKey; + } + public function setConfig($config) { $this->config = $config;