Skip to content
This repository has been archived by the owner. It is now read-only.

Commit cae36fa

Browse files
authored
Fix http errors behaviour using the guzzle middleware (#10)
1 parent 1d9702b commit cae36fa

File tree

5 files changed

+148
-42
lines changed

5 files changed

+148
-42
lines changed

README.md

+28
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,34 @@ To publish the scraper config, you can use
2121
```bash
2222
php artisan vendor:publish --provider="Softonic\LaravelIntelligentScraper\ScraperProvider" --tag=config
2323
```
24+
### Dependencies
25+
26+
This package depends on [goutte](https://packagist.org/packages/fabpot/goutte), which in turn depends on [guzzle](https://packagist.org/packages/guzzlehttp/guzzle), so you can customize the client to
27+
your requirements. The only requirement for this package is that you must include the `http_errors` middleware in the
28+
handler stack.
29+
30+
Example:
31+
```php
32+
<?php
33+
34+
use GuzzleHttp\Handler\CurlHandler;
35+
use GuzzleHttp\HandlerStack;
36+
use GuzzleHttp\Middleware;
37+
use Goutte\Client as GoutteClient;
38+
use App\MyMiddleware;
39+
40+
$client = new GoutteClient();
41+
$stack = new HandlerStack();
42+
43+
$stack->setHandler(new CurlHandler());
44+
$stack->push(MyMiddleware::getHandler(), 'my_middleware'); // Your custom middleware
45+
$stack->push(Middleware::httpErrors(), 'http_errors'); // Required middleware for the package
46+
47+
$guzzleClient = new \GuzzleHttp\Client(['handler' => $stack]);
48+
$client->setClient($guzzleClient);
49+
```
50+
51+
The default stack already has the http_errors middleware, so you only need to do this if you are not using the default stack.
2452

2553
## Configuration
2654

src/Scraper/Application/Configurator.php

+13-8
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
namespace Softonic\LaravelIntelligentScraper\Scraper\Application;
44

55
use Goutte\Client;
6+
use GuzzleHttp\Exception\ConnectException;
7+
use GuzzleHttp\Exception\RequestException;
68
use Illuminate\Support\Collection;
79
use Illuminate\Support\Facades\Log;
810
use Softonic\LaravelIntelligentScraper\Scraper\Events\ConfigurationScraped;
@@ -74,21 +76,24 @@ public function configureFromDataset($scrapedDataset): Collection
7476

7577
private function getCrawler($scrapedData)
7678
{
77-
Log::info("Request {$scrapedData['url']}");
78-
$crawler = $this->client->request('GET', $scrapedData['url']);
79+
try {
80+
Log::info("Request {$scrapedData['url']}");
7981

80-
$httpCode = $this->client->getInternalResponse()->getStatus();
81-
if ($httpCode !== 200) {
82+
return $this->client->request('GET', $scrapedData['url']);
83+
} catch (ConnectException $e) {
84+
Log::notice(
85+
"Connection error: {$e->getMessage()}",
86+
compact('scrapedData')
87+
);
88+
$scrapedData->delete();
89+
} catch (RequestException $e) {
90+
$httpCode = $e->getResponse()->getStatusCode() ?? null;
8291
Log::notice(
8392
"Response status ({$httpCode}) invalid, so proceeding to delete the scraped data.",
8493
compact('scrapedData')
8594
);
8695
$scrapedData->delete();
87-
88-
return null;
8996
}
90-
91-
return $crawler;
9297
}
9398

9499
/**

src/Scraper/Application/XpathFinder.php

+20-8
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
namespace Softonic\LaravelIntelligentScraper\Scraper\Application;
44

55
use Goutte\Client as GoutteClient;
6+
use GuzzleHttp\Exception\ConnectException;
7+
use GuzzleHttp\Exception\RequestException;
68
use Illuminate\Support\Facades\Log;
79
use Softonic\LaravelIntelligentScraper\Scraper\Exceptions\MissingXpathValueException;
810

@@ -26,15 +28,9 @@ public function __construct(GoutteClient $client, VariantGenerator $variantGener
2628

2729
public function extract(string $url, $configs): array
2830
{
29-
Log::info("Requesting $url");
30-
$crawler = $this->client->request('GET', $url);
31-
$httpCode = $this->client->getInternalResponse()->getStatus();
32-
if ($httpCode !== 200) {
33-
Log::info('Invalid response http status', ['status' => $httpCode]);
34-
throw new \UnexpectedValueException("Response error from '{$url}' with '{$httpCode}' http code");
35-
}
31+
$crawler = $this->getCrawler($url);
3632

37-
Log::info('Response Received. Starting crawler.');
33+
Log::info('Response Received. Start crawling.');
3834
$result = [];
3935
foreach ($configs as $config) {
4036
Log::info("Searching field {$config['name']}.");
@@ -68,4 +64,20 @@ public function extract(string $url, $configs): array
6864

6965
return $result;
7066
}
67+
68+
private function getCrawler(string $url)
69+
{
70+
try {
71+
Log::info("Requesting $url");
72+
73+
return $this->client->request('GET', $url);
74+
} catch (ConnectException $e) {
75+
Log::info("Unavailable url '{$url}'", ['message' => $e->getMessage()]);
76+
throw new \UnexpectedValueException("Unavailable url '{$url}'");
77+
} catch (RequestException $e) {
78+
$httpCode = $e->getResponse()->getStatusCode();
79+
Log::info('Invalid response http status', ['status' => $httpCode]);
80+
throw new \UnexpectedValueException("Response error from '{$url}' with '{$httpCode}' http code");
81+
}
82+
}
7183
}

tests/Unit/Scraper/Application/ConfiguratorTest.php

+45-14
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
namespace Softonic\LaravelIntelligentScraper\Scraper\Application;
44

55
use Goutte\Client;
6+
use GuzzleHttp\Exception\ConnectException;
7+
use GuzzleHttp\Exception\RequestException;
68
use Illuminate\Foundation\Testing\DatabaseMigrations;
79
use Illuminate\Support\Facades\Log;
810
use Mockery\Mock;
@@ -64,7 +66,7 @@ public function setUp()
6466
/**
6567
* @test
6668
*/
67-
public function whenTryToFindNewXpathButUrlFromDatasetIsNotValidThrowAnExceptionAndRemoveIt()
69+
public function whenTryToFindNewXpathButUrlFromDatasetIsNotFoundThrowAnExceptionAndRemoveIt()
6870
{
6971
$posts = [
7072
new ScrapedDataset([
@@ -77,16 +79,55 @@ public function whenTryToFindNewXpathButUrlFromDatasetIsNotValidThrowAnException
7779
]),
7880
];
7981

82+
$requestException = \Mockery::mock(RequestException::class);
83+
$requestException->shouldReceive('getResponse->getStatusCode')
84+
->once()
85+
->andReturn(404);
8086
$this->client->shouldReceive('request')
8187
->once()
8288
->with(
8389
'GET',
8490
'https://test.c/123456789012'
8591
)
86-
->andReturnSelf();
87-
$this->client->shouldReceive('getInternalResponse->getStatus')
92+
->andThrows($requestException);
93+
94+
$this->configuration->shouldReceive('findByType')
8895
->once()
89-
->andReturn(404);
96+
->with('post')
97+
->andReturn(collect());
98+
99+
try {
100+
$this->configurator->configureFromDataset($posts);
101+
} catch (ConfigurationException $e) {
102+
$this->assertEquals('Field(s) "title,author" not found.', $e->getMessage());
103+
$this->assertDatabaseMissing('scraped_datasets', ['url' => 'https://test.c/123456789012']);
104+
}
105+
}
106+
107+
/**
108+
* @test
109+
*/
110+
public function whenTryToFindNewXpathButUrlFromDatasetIsNotAvailableThrowAnExceptionAndRemoveIt()
111+
{
112+
$posts = [
113+
new ScrapedDataset([
114+
'url' => 'https://test.c/123456789012',
115+
'type' => 'post',
116+
'data' => [
117+
'title' => 'My Title',
118+
'author' => 'My author',
119+
],
120+
]),
121+
];
122+
123+
$connectException = \Mockery::mock(ConnectException::class);
124+
$this->client->shouldReceive('request')
125+
->once()
126+
->with(
127+
'GET',
128+
'https://test.c/123456789012'
129+
)
130+
->andThrows($connectException);
90131

91132
$this->configuration->shouldReceive('findByType')
92133
->once()
@@ -125,9 +166,6 @@ public function whenTryToFindNewXpathButNotFoundItShouldLogItAndResetVariant()
125166
'https://test.c/123456789012'
126167
)
127168
->andReturnSelf();
128-
$this->client->shouldReceive('getInternalResponse->getStatus')
129-
->once()
130-
->andReturn(200);
131169

132170
$rootElement = new \DOMElement('test');
133171
$this->client->shouldReceive('getNode')
@@ -191,9 +229,6 @@ public function whenUseSomeOldXpathButNotFoundNewsItShouldLogItAndResetVariant()
191229
'https://test.c/123456789012'
192230
)
193231
->andReturnSelf();
194-
$this->client->shouldReceive('getInternalResponse->getStatus')
195-
->once()
196-
->andReturn(200);
197232

198233
$rootElement = new \DOMElement('test');
199234
$this->client->shouldReceive('getNode')
@@ -282,8 +317,6 @@ public function whenTryToFindXpathInMultiplepostsAndNotFoundInAnyItShouldThrowAn
282317
'https://test.c/123456789022'
283318
)
284319
->andReturnSelf();
285-
$this->client->shouldReceive('getInternalResponse->getStatus')
286-
->andReturn(200);
287320
$this->client->shouldReceive('getUri')
288321
->andReturn('https://test.c/123456789012');
289322

@@ -382,8 +415,6 @@ public function whenDiscoverDifferentXpathItShouldGetAllOfThemAndUpdateTheVarian
382415
'https://test.c/123456789033'
383416
)
384417
->andReturnSelf();
385-
$this->client->shouldReceive('getInternalResponse->getStatus')
386-
->andReturn(200);
387418

388419
$rootElement = new \DOMElement('test');
389420
$this->client->shouldReceive('getNode')

tests/Unit/Scraper/Application/XpathFinderTest.php

+42-12
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
namespace Softonic\LaravelIntelligentScraper\Scraper\Application;
44

55
use Goutte\Client;
6+
use GuzzleHttp\Exception\ConnectException;
7+
use GuzzleHttp\Exception\RequestException;
68
use Illuminate\Foundation\Testing\DatabaseMigrations;
79
use Illuminate\Support\Facades\Log;
810
use Softonic\LaravelIntelligentScraper\Scraper\Exceptions\MissingXpathValueException;
@@ -23,7 +25,7 @@ public function setUp()
2325
/**
2426
* @test
2527
*/
26-
public function whenExtractUsingAnInvalidUrlItShouldThrowAnException()
28+
public function whenExtractUsingAnInvalidUrlStatusItShouldThrowAnException()
2729
{
2830
$config = [
2931
Configuration::create([
@@ -34,21 +36,55 @@ public function whenExtractUsingAnInvalidUrlItShouldThrowAnException()
3436
];
3537

3638
$variantGenerator = \Mockery::mock(VariantGenerator::class);
39+
40+
$requestException = \Mockery::mock(RequestException::class);
41+
$requestException->shouldReceive('getResponse->getStatusCode')
42+
->once()
43+
->andReturn(404);
44+
3745
$client = \Mockery::mock(Client::class);
3846
$client->shouldReceive('request')
3947
->once()
4048
->with(
4149
'GET',
4250
'url'
4351
)
44-
->andReturnSelf();
52+
->andThrows($requestException);
53+
54+
$this->expectException(\UnexpectedValueException::class);
55+
$this->expectExceptionMessage('Response error from \'url\' with \'404\' http code');
56+
57+
$xpathFinder = new XpathFinder($client, $variantGenerator);
58+
$xpathFinder->extract('url', $config);
59+
}
60+
61+
/**
62+
* @test
63+
*/
64+
public function whenExtractUsingAnUnavailableUrlItShouldThrowAnException()
65+
{
66+
$config = [
67+
Configuration::create([
68+
'name' => 'title',
69+
'type' => 'post',
70+
'xpaths' => ['//*[@id="title"]'],
71+
]),
72+
];
73+
74+
$variantGenerator = \Mockery::mock(VariantGenerator::class);
4575

46-
$client->shouldReceive('getInternalResponse->getStatus')
76+
$connectException = \Mockery::mock(ConnectException::class);
77+
$client = \Mockery::mock(Client::class);
78+
$client->shouldReceive('request')
4779
->once()
48-
->andReturn(404);
80+
->with(
81+
'GET',
82+
'url'
83+
)
84+
->andThrows($connectException);
4985

5086
$this->expectException(\UnexpectedValueException::class);
51-
$this->expectExceptionMessage('Response error from \'url\' with \'404\' http code');
87+
$this->expectExceptionMessage('Unavailable url \'url\'');
5288

5389
$xpathFinder = new XpathFinder($client, $variantGenerator);
5490
$xpathFinder->extract('url', $config);
@@ -73,17 +109,14 @@ public function whenXpathIsMissingAValueItShouldThrowAnException()
73109
$internalXpathFinder = \Mockery::mock(\Symfony\Component\DomCrawler\Crawler::class);
74110

75111
$variantGenerator = \Mockery::mock(VariantGenerator::class);
76-
$client = \Mockery::mock(Client::class);
112+
$client = \Mockery::mock(Client::class);
77113
$client->shouldReceive('request')
78114
->once()
79115
->with(
80116
'GET',
81117
'url'
82118
)
83119
->andReturn($internalXpathFinder);
84-
$client->shouldReceive('getInternalResponse->getStatus')
85-
->once()
86-
->andReturn(200);
87120

88121
$internalXpathFinder->shouldReceive('filterXPath')
89122
->once()
@@ -145,9 +178,6 @@ public function whenXpathsAreFoundItShouldReturnTheFoundValues()
145178
'url'
146179
)
147180
->andReturn($internalXpathFinder);
148-
$client->shouldReceive('getInternalResponse->getStatus')
149-
->once()
150-
->andReturn(200);
151181

152182
$internalXpathFinder->shouldReceive('filterXPath')
153183
->once()

0 commit comments

Comments
 (0)