forked from apify/actor-scrapy-executor
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.js
More file actions
91 lines (80 loc) · 3.27 KB
/
Copy pathmain.js
File metadata and controls
91 lines (80 loc) · 3.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
const Apify = require('apify');
const { spawn } = require('child_process');
const fs = require('fs');
const tar = require('tar');
const tarfs = require('tar-fs');
const execSync = require('child_process').execSync;
/**
 * Run a shell command whose failure must not abort the actor.
 * A failure is logged instead of being silently discarded.
 *
 * @param {string} command - Shell command line passed to execSync.
 */
function runBestEffort(command) {
    try {
        execSync(command);
    } catch (err) {
        console.log(`Command "${command}" failed: ${err.message}`);
    }
}

/**
 * Decide which proxy URL (if any) the scrapy process should use.
 *
 * Precedence: custom proxy URLs (only when Apify proxy is disabled), then
 * Apify proxy with explicit groups, then Apify proxy in "auto" mode.
 *
 * @param {Object} [proxyConfig] - The `proxyConfig` field of the actor input.
 * @returns {?string} Proxy URL, or null when no proxy should be used.
 */
function resolveProxyAddress(proxyConfig) {
    // A missing proxyConfig used to crash with a TypeError; treat it as "no proxy".
    if (proxyConfig == null) return null;

    const proxyUrls = proxyConfig.proxyUrls;
    const proxyGroups = proxyConfig.apifyProxyGroups;
    const password = process.env.APIFY_PROXY_PASSWORD;

    if (!proxyConfig.useApifyProxy) {
        return (proxyUrls != null && proxyUrls.length !== 0) ? proxyUrls[0] : null;
    }
    if (proxyGroups != null && proxyGroups.length !== 0) {
        return `http://groups-${proxyGroups.join('+')}:${password}@proxy.apify.com:8000`;
    }
    return `http://auto:${password}@proxy.apify.com:8000`;
}

/**
 * Restore the persisted `crawls/` job directory from a previously stored archive.
 *
 * @param {?Buffer} archive - Value stored under the `jobdir.tgz` key, or null.
 * @returns {Promise<void>} Resolves once the archive is fully extracted
 *     (immediately when there is nothing to restore).
 */
function restoreJobDir(archive) {
    if (archive == null) return Promise.resolve();

    fs.writeFileSync('downloaded.tgz', archive);
    runBestEffort('rm -rf ./crawls/');

    // Extraction is asynchronous: wait for 'finish' so the spiders never start
    // against a half-restored job directory.
    return new Promise((resolve, reject) => {
        const extractor = tarfs.extract('./');
        extractor.on('finish', resolve);
        extractor.on('error', reject);
        fs.createReadStream('downloaded.tgz').on('error', reject).pipe(extractor);
    });
}

/**
 * Archive `crawls/` and upload it to the key-value store under `jobdir.tgz`.
 * Never rejects: failures are logged so that a periodic caller keeps running.
 *
 * @returns {Promise<void>}
 */
function persistJobDir() {
    // Nothing to archive until scrapy has created the job directory.
    if (!fs.existsSync('crawls/')) return Promise.resolve();

    // The archive is written uncompressed (gzip: false) despite its name; the
    // restore side reads it with tar-fs, which is given the file as-is.
    return tar.c({ gzip: false, file: 'jobdir.tgz' }, ['crawls/'])
        .then(() => Apify.setValue('jobdir.tgz', fs.readFileSync('jobdir.tgz'), { contentType: 'application/tar+gzip' }))
        .catch((err) => {
            console.log(`Failed to persist spider state: ${err.message}`);
        });
}

/**
 * List all scrapy spiders and crawl each of them (`scrapy list | xargs -n 1 scrapy crawl`),
 * persisting the job directory every 5 seconds while the crawl is running.
 *
 * @param {?string} proxyAddress - Proxy URL exported to scrapy as `http_proxy`, or null.
 */
function runSpiders(proxyAddress) {
    // if apify didn't auto-create
    runBestEffort('mkdir -p ./apify_storage/datasets/default ./apify_storage/key_value_stores/default');

    // construct scrapy env vars
    const env = Object.assign({}, process.env);
    if (proxyAddress != null) {
        // NOTE(review): only http_proxy is exported, as before; confirm whether
        // https_proxy should be set as well.
        env.http_proxy = proxyAddress;
    }

    // update spider state every 5 seconds, skipping a tick while the previous
    // upload is still in flight so two archives are never written at once
    let isPersisting = false;
    const storeJobsInterval = setInterval(() => {
        if (isPersisting) return;
        isPersisting = true;
        persistJobDir().then(() => {
            isPersisting = false;
        });
    }, 5000);

    // run spiders
    const scrapyList = spawn('scrapy', ['list']);
    const scrapyRun = spawn('xargs', ['-n', '1', 'scrapy', 'crawl'], { env });

    // Without 'error' listeners a missing binary would surface as an uncaught exception.
    scrapyList.on('error', (err) => {
        console.log(`Failed to run scrapy list: ${err.message}`);
        // Let xargs terminate even if 'close' is never emitted for scrapy list.
        scrapyRun.stdin.end();
    });
    scrapyRun.on('error', (err) => {
        console.log(`Failed to run scrapy crawl: ${err.message}`);
        clearInterval(storeJobsInterval);
    });
    scrapyRun.stdin.on('error', (err) => {
        // e.g. EPIPE when xargs exited before all spider names were written
        console.log(`Failed to pass spider names to scrapy crawl: ${err.message}`);
    });

    scrapyList.stdout.on('data', (data) => {
        scrapyRun.stdin.write(data);
    });
    scrapyList.stderr.on('data', (data) => {
        console.log(`${data}`);
    });
    scrapyList.on('close', (code) => {
        if (code !== 0) {
            console.log(`scrapy list exited with code ${code}`);
        }
        scrapyRun.stdin.end();
    });

    scrapyRun.stdout.on('data', (data) => {
        console.log(data.toString());
    });
    scrapyRun.stderr.on('data', (data) => {
        console.log(`${data}`);
    });
    scrapyRun.on('close', (code) => {
        clearInterval(storeJobsInterval);
        if (code !== 0) {
            console.log(`scrapy crawl process exited with code ${code}`);
        }
    });
}

// Actor entry point: build the spider from the input, restore persisted crawl
// state, then run every spider found by `scrapy list`.
Apify.getValue('INPUT')
    .then((input) => {
        let proxyAddress = null;
        if (input != null) {
            // build spider (writeFileSync is synchronous and throws on failure;
            // it takes no callback)
            fs.writeFileSync('./actor/spiders/run.py', input.scrapyCode);
            console.log('Successfully built scrapy spider.');
            // configure proxy
            proxyAddress = resolveProxyAddress(input.proxyConfig);
        }
        // load persistent storage
        return Apify.getValue('jobdir.tgz')
            .then(restoreJobDir)
            .then(() => runSpiders(proxyAddress));
    })
    .catch((err) => {
        console.log(`Actor failed: ${err && err.stack ? err.stack : err}`);
        process.exitCode = 1;
    });