Skip to content
This repository has been archived by the owner. It is now read-only.

Commit 30430cd

Browse files
authored
Detect page variants to increase reliability (#3)
Detect page variants to increase reliability
1 parent e551ba6 commit 30430cd

19 files changed

+277
-68
lines changed

Diff for: README.md

+10-5
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ Due to the pages usually have different structures depending on different variab
6363
trying to cover maximum page variations possible. The scraper WILL NOT BE ABLE to scrape page variations not incorporated
6464
in the dataset.
6565

66-
Once we did the job, all is ready to work. You should not care about updates always you have enought data in the dataset
66+
Once we have done the job, everything is ready to work. You do not need to worry about updates as long as you have enough data in the dataset
6767
to cover all the new modifications on the page, so the scraper will recalculate the modifications on the fly. You can
6868
check [how it works](how-it-works.md) to learn more about the internals.
6969

@@ -74,6 +74,7 @@ We will check more deeply how we can create a new dataset and what options are a
7474
The dataset is composed of `url` and `data`.
7575
* The `url` part is simple, you just need to indicate the url from where you obtained the data.
7676
* The `type` part gives an item name to the current dataset. This allows you to define multiple types.
77+
* The `variant` part identifies the page variant. The identifier is a sha1 hash built from the xpath used to get the data.
7778
* The `data` part is where you indicate what data and assign the label that you want to get.
7879
The data could be a list of items or a single item.
7980

@@ -83,9 +84,10 @@ A basic example could be:
8384
use Softonic\LaravelIntelligentScraper\Scraper\Models\ScrapedDataset;
8485

8586
ScrapedDataset::create([
86-
'url' => 'https://test.c/p/my-objective',
87-
'type' => 'Item-definition-1',
88-
'data' => [
87+
'url' => 'https://test.c/p/my-objective',
88+
'type' => 'Item-definition-1',
89+
'variant' => '8ed10778a83f1266e7ffed90205f7fb61ddcdf78',
90+
'data' => [
8991
'title' => 'My title',
9092
'body' => 'This is the body content I want to get',
9193
'images' => [
@@ -112,6 +114,7 @@ use Softonic\LaravelIntelligentScraper\Scraper\Models\ScrapedDataset;
112114
ScrapedDataset::create([
113115
'url' => 'https://test.c/p/my-objective',
114116
'type' => 'Item-definition-1',
117+
'variant' => '8ed10778a83f1266e7ffed90205f7fb61ddcdf78',
115118
'data' => [
116119
'title' => 'My title',
117120
'body' => regexp('/^Body starts here, but it is do long that.*$/si'),
@@ -160,7 +163,7 @@ After configure the scraper, you will be able to request an specific scrape usin
160163
```php
161164
<?php
162165

163-
scrape('https://test.c/p/my-objective', 'Item-definition-1');
166+
scrape('https://test.c/p/my-objective', 'Item-definition-1', 'variant-sha1');
164167
```
165168

166169
The scrape will produce a `\Softonic\LaravelIntelligentScraper\Scraper\Events\Scraped` event if all worked as expected.
@@ -170,6 +173,7 @@ So attach a listener to that event to receive the data.
170173
$event->scrapeRequest->url // Url scraped
171174
$event->scrapeRequest->type // Request type
172175
$event->data // Contains all the data in a [ 'fieldName' => 'value' ] format.
176+
$event->variant // Contains the page variation sha1 hash.
173177
```
174178

175179
All the output fields are arrays that can contain one or more results.
@@ -212,6 +216,7 @@ The scraper is auto configurable, but needs an initial dataset or add a configur
212216
The dataset tells the configurator which data you want and how to label it.
213217

214218
There are three services that have unique responsibilities and are connected using the event system.
219+
215220
### Scrape
216221

217222
It is fired when the system receives a `\Softonic\LaravelIntelligentScraper\Scraper\Events\ScrapeRequest` event. It

Diff for: src/Scraper/Application/Configurator.php

+20-3
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,16 @@ class Configurator
2222
*/
2323
private $xpathBuilder;
2424

25-
public function __construct(Client $client, XpathBuilder $xpathBuilder)
25+
/**
26+
* @var VariantGenerator
27+
*/
28+
private $variantGenerator;
29+
30+
public function __construct(Client $client, XpathBuilder $xpathBuilder, VariantGenerator $variantGenerator)
2631
{
27-
$this->client = $client;
28-
$this->xpathBuilder = $xpathBuilder;
32+
$this->client = $client;
33+
$this->xpathBuilder = $xpathBuilder;
34+
$this->variantGenerator = $variantGenerator;
2935
}
3036

3137
/**
@@ -76,18 +82,23 @@ private function getCrawler($scrapedData)
7682
private function findConfigByScrapedData($scrapedData, $crawler)
7783
{
7884
$result = [];
85+
7986
foreach ($scrapedData['data'] as $field => $value) {
8087
try {
8188
$result[$field] = $this->xpathBuilder->find(
8289
$crawler->getNode(0),
8390
$value
8491
);
92+
$this->variantGenerator->addConfig($field, $result[$field]);
8593
} catch (\UnexpectedValueException $e) {
94+
$this->variantGenerator->fieldNotFound();
8695
$value = is_array($value) ? json_encode($value) : $value;
8796
Log::warning("Field '{$field}' with value '{$value}' not found for '{$crawler->getUri()}'.");
8897
}
8998
}
9099

100+
$this->updateVariant($scrapedData);
101+
91102
return $result;
92103
}
93104

@@ -134,4 +145,10 @@ private function checkConfiguration($data, Collection $finalConfig)
134145
throw new ConfigurationException("Field(s) \"{$fieldsMissing}\" not found.", 0);
135146
}
136147
}
148+
149+
private function updateVariant($scrapedData): void
150+
{
151+
$scrapedData['variant'] = $this->variantGenerator->getId($scrapedData['type']);
152+
$scrapedData->save();
153+
}
137154
}

Diff for: src/Scraper/Application/VariantGenerator.php

+53
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
<?php
2+
3+
namespace Softonic\LaravelIntelligentScraper\Scraper\Application;
4+
5+
/**
 * Builds an identifier for a page variant from the xpaths that matched each field.
 *
 * Usage: register every matched field with addConfig() (and report misses with
 * fieldNotFound()), then call getId() once to obtain the identifier. getId()
 * clears the accumulated state, so the same instance can be reused for the
 * next page.
 */
class VariantGenerator
{
    /**
     * Default type used by getId() when no type is passed explicitly.
     *
     * @var string|null
     */
    protected $type = null;

    /**
     * One "<field><xpath>" entry per field registered through addConfig().
     *
     * @var string[]
     */
    protected $configPerField = [];

    /**
     * Becomes false once fieldNotFound() is called; in that case getId() yields null.
     *
     * @var bool
     */
    protected $allFieldsFound = true;

    /**
     * Sets the default type used by getId().
     *
     * @param string|null $type
     */
    public function setType($type): void
    {
        $this->type = $type;
    }

    /**
     * Registers the xpath that matched the given field.
     *
     * @param string $field
     * @param string $xpath
     */
    public function addConfig($field, $xpath)
    {
        $this->configPerField[] = $field . $xpath;
    }

    /**
     * Flags that at least one field could not be located in the current page.
     */
    public function fieldNotFound()
    {
        $this->allFieldsFound = false;
    }

    /**
     * Returns the variant identifier for the accumulated configuration and resets the generator.
     *
     * The identifier is the sha1 of the type followed by the sorted "<field><xpath>" entries,
     * so it does not depend on the order in which fields were registered.
     *
     * @param string|null $type Type to use; falls back to the one given to setType().
     *
     * @return string|null 40 character sha1 hash, or null when no field was registered
     *                     or some field was reported as not found.
     *
     * @throws \InvalidArgumentException When no type is available. The accumulated state is kept
     *                                   in that case so the call can be repeated with a type.
     */
    public function getId($type = null)
    {
        $type = $type ?? $this->type;
        if (empty($type)) {
            throw new \InvalidArgumentException('Type should be provided in the getId call or setType');
        }

        $id = null;
        if (!empty($this->configPerField) && $this->allFieldsFound) {
            sort($this->configPerField);

            $id = sha1($type . implode('', $this->configPerField));
        }

        // Reset on every outcome, not only on success: otherwise a single missing field would
        // leave allFieldsFound = false (and stale entries) behind, and every later page handled
        // by this same instance would get a null or polluted variant.
        $this->reset();

        return $id;
    }

    /**
     * Clears the type, the registered fields and the "field not found" flag.
     */
    public function reset()
    {
        $this->setType(null);
        $this->configPerField = [];
        $this->allFieldsFound = true;
    }
}

Diff for: src/Scraper/Application/XpathFinder.php

+12-3
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,15 @@ class XpathFinder
1212
*/
1313
private $client;
1414

15-
public function __construct(GoutteClient $client)
15+
/**
16+
* @var VariantGenerator
17+
*/
18+
private $variantGenerator;
19+
20+
public function __construct(GoutteClient $client, VariantGenerator $variantGenerator)
1621
{
17-
$this->client = $client;
22+
$this->client = $client;
23+
$this->variantGenerator = $variantGenerator;
1824
}
1925

2026
public function extract(string $url, $configs): array
@@ -32,6 +38,7 @@ public function extract(string $url, $configs): array
3238
$subcrawler = $crawler->filterXPath($xpath);
3339

3440
if ($subcrawler->count()) {
41+
$this->variantGenerator->addConfig($config['name'], $xpath);
3542
break;
3643
}
3744
}
@@ -43,11 +50,13 @@ public function extract(string $url, $configs): array
4350
);
4451
}
4552

46-
$result[$config['name']] = $subcrawler->each(function ($node) {
53+
$result['data'][$config['name']] = $subcrawler->each(function ($node) {
4754
return $node->text();
4855
});
4956
}
5057

58+
$result['variant'] = $this->variantGenerator->getId($config['type']);
59+
5160
return $result;
5261
}
5362
}

Diff for: src/Scraper/Events/Scraped.php

+8-1
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,22 @@ class Scraped
1919
*/
2020
public $data;
2121

22+
/**
23+
* @var string
24+
*/
25+
public $variant;
26+
2227
/**
2328
* Create a new event instance.
2429
*
2530
* @param ScrapeRequest $scrapeRequest
2631
* @param array $data
32+
* @param string $variant
2733
*/
28-
public function __construct(ScrapeRequest $scrapeRequest, array $data)
34+
public function __construct(ScrapeRequest $scrapeRequest, array $data, string $variant)
2935
{
3036
$this->scrapeRequest = $scrapeRequest;
3137
$this->data = $data;
38+
$this->variant = $variant;
3239
}
3340
}

Diff for: src/Scraper/Listeners/ConfigureScraper.php

+2-2
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,8 @@ private function extractData(ScrapeRequest $scrapeRequest, $config): void
7474
{
7575
$this->logger->debug("Extracting data from $scrapeRequest->url for type '$scrapeRequest->type'");
7676

77-
$data = $this->xpathFinder->extract($scrapeRequest->url, $config);
78-
event(new Scraped($scrapeRequest, $data));
77+
list('data' => $data, 'variant' => $variant) = $this->xpathFinder->extract($scrapeRequest->url, $config);
78+
event(new Scraped($scrapeRequest, $data, $variant));
7979
}
8080

8181
/**

Diff for: src/Scraper/Listeners/Scrape.php

+2-2
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ private function extractData(ScrapeRequest $scrapeRequest, $config): void
7777
{
7878
$this->logger->debug("Extracting data from $scrapeRequest->url for type '$scrapeRequest->type'");
7979

80-
$data = $this->xpathFinder->extract($scrapeRequest->url, $config);
81-
event(new Scraped($scrapeRequest, $data));
80+
list('data' => $data, 'variant' => $variant) = $this->xpathFinder->extract($scrapeRequest->url, $config);
81+
event(new Scraped($scrapeRequest, $data, $variant));
8282
}
8383
}

Diff for: src/Scraper/Listeners/UpdateDataset.php

+7-4
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,19 @@ public function handle(Scraped $event)
2323

2424
private function addDataset(Scraped $event)
2525
{
26-
$scraperDatasets = ScrapedDataset::withType($event->scrapeRequest->type);
26+
$scraperDatasets = ScrapedDataset::withType($event->scrapeRequest->type)
27+
->withVariant($event->variant);
28+
2729
if (self::DATASET_AMOUNT_LIMIT <= $scraperDatasets->count()) {
2830
$scraperDatasets->orderBy('updated_at', 'desc')->first()->delete();
2931
}
3032

3133
ScrapedDataset::create(
3234
[
33-
'url' => $event->scrapeRequest->url,
34-
'type' => $event->scrapeRequest->type,
35-
'data' => $event->data,
35+
'url' => $event->scrapeRequest->url,
36+
'type' => $event->scrapeRequest->type,
37+
'variant' => $event->variant,
38+
'data' => $event->data,
3639
]
3740
);
3841
}

Diff for: src/Scraper/Models/ScrapedDataset.php

+6
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,17 @@ class ScrapedDataset extends Model
3535
protected $fillable = [
3636
'url',
3737
'type',
38+
'variant',
3839
'data',
3940
];
4041

4142
public function scopeWithType($query, string $type)
4243
{
4344
return $query->where('type', $type);
4445
}
46+
47+
public function scopeWithVariant($query, string $variant)
48+
{
49+
return $query->where('variant', $variant);
50+
}
4551
}

Diff for: src/Scraper/Traits/VariantId.php

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?php
2+
3+
namespace Softonic\LaravelIntelligentScraper\Scraper\Traits;
4+
5+
/**
 * Helper to derive a page variant identifier from a list of configuration strings.
 */
trait VariantId
{
    /**
     * Computes the sha1 of the type followed by the sorted variant entries.
     *
     * @param string   $type    Prefix of the hashed payload.
     * @param string[] $variant Entries describing the variant; their order is irrelevant.
     *
     * @return string|null The sha1 hash, or null when $variant is empty.
     */
    protected function getVariantId(string $type, array $variant)
    {
        if (count($variant) > 0) {
            // Sorting the local copy makes the hash independent of the entry order.
            sort($variant);
            $payload = $type . implode('', $variant);

            return sha1($payload);
        }

        return null;
    }
}

Diff for: src/Scraper/helpers.php

+10-6
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
<?php
22

3-
function regexp($regexp)
4-
{
5-
return ['regexp' => $regexp];
3+
if (!function_exists('regexp')) {
4+
function regexp($regexp)
5+
{
6+
return ['regexp' => $regexp];
7+
}
68
}
79

8-
function scrape($url, $type)
9-
{
10-
event(new \Softonic\LaravelIntelligentScraper\Scraper\Events\ScrapeRequest($url, $type));
10+
if (!function_exists('scrape')) {
11+
function scrape($url, $type)
12+
{
13+
event(new \Softonic\LaravelIntelligentScraper\Scraper\Events\ScrapeRequest($url, $type));
14+
}
1115
}

Diff for: src/database/factories/ScrapedDatasetFactory.php

+4-3
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,10 @@
1616
/* @var \Illuminate\Database\Eloquent\Factory $factory */
1717
$factory->define(ScrapedDataset::class, function (Faker\Generator $faker) {
1818
return [
19-
'url' => $faker->url . $faker->randomDigit,
20-
'type' => 'post',
21-
'data' => [
19+
'url' => $faker->url . $faker->randomDigit,
20+
'type' => 'post',
21+
'variant' => $faker->sha1,
22+
'data' => [
2223
'title' => $faker->word,
2324
'author' => $faker->word,
2425
],

Diff for: src/database/migrations/2018_07_16_091154_create_scraped_datasets_table.php

+1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ public function up()
1515
Schema::create('scraped_datasets', function (Blueprint $table) {
1616
$table->string('url', 1024)->primary();
1717
$table->string('type');
18+
$table->string('variant', 40)->index()->nullable();
1819
$table->json('data');
1920
$table->timestamps();
2021
});

0 commit comments

Comments
 (0)