Skip to content

Commit 377c527

Browse files
committed
Merge branch 'topic-scraping' into v0.1
2 parents 74a13be + 6ab7a5a commit 377c527

File tree

2 files changed

+126
-1
lines changed

2 files changed

+126
-1
lines changed

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
"static-favicon": "~1.0.0",
2727
"tough-cookie": "^0.12.1",
2828
"underscore": "^1.6.0",
29-
"xml2json": "^0.4.0"
29+
"xml2json": "^0.4.0",
30+
"cheerio": "latest"
3031
},
3132
"devDependencies": {
3233
"debowerify": "^0.9.1",

script/crawler.js

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
/*
2+
script/crawler.js
3+
4+
resources/company_ids.jsにあるcompanyIDのリストを使ってスクレイピングをするツール
5+
*/
6+
7+
//Initialize instance
8+
var request = require("request");
9+
var cheerio = require("cheerio");
10+
11+
var company_ids = require('../resources/company_ids');
12+
13+
var output = [];
14+
15+
var i = 0;
16+
17+
/**
 * Print the collected company records.
 *
 * Does nothing until the shared counter `i` has reached the number of entries
 * in `company_ids`; from then on every call logs the `output` array.
 */
function outputFunc() {
  var reachedEnd = i >= company_ids.length;
  if (!reachedEnd) {
    return;
  }
  console.log( output );
}
22+
23+
function crawl(name){
24+
i++; //counter
25+
var requestUrl = "https://www.wantedly.com/companies/" + name + "/info";
26+
request({url: requestUrl}, function(error, response, body) {
27+
if (!error && response.statusCode == 200) {
28+
$ = cheerio.load(body);
29+
30+
// 募集人数
31+
var job_count_raw =
32+
$("ul.wt-ui-tab > li:nth-child(3) > a").text();
33+
var job_count_list = /\((\d+)\)/.exec(job_count_raw);
34+
var job_count;
35+
if(job_count_list)
36+
job_count = job_count_list.pop();
37+
38+
// 従業員数
39+
var company_basic_info_html =
40+
$("section.company_basic_info > ul").text();
41+
var employee_count_list =
42+
/(\d+)\n?/.exec(company_basic_info_html);
43+
var employee_count;
44+
if (employee_count_list){
45+
employee_count = employee_count_list[1];
46+
}
47+
// 創立年月
48+
var foundation_date_list =
49+
/(\d+\d+)/.exec(company_basic_info_html);
50+
var foundation_date;
51+
if (foundation_date_list){
52+
foundation_date = foundation_date_list[1];
53+
}
54+
55+
// 会社情報
56+
var company_name = $("h1.company-name > a").text();
57+
var address = $("section.section_location.article > address").text();
58+
var logo_url = $("div.profile-photo > a > span > img").attr("src");
59+
var company_url = $("p.company-url > a").text();
60+
61+
// 私たちについて
62+
var about_me = $("section.section_origin > p").text();
63+
64+
// 経度緯度 (あとでちゃんと取る)
65+
var lats_raw_list = /Gmaps\.map\.markers\s\=\s\[(.*)\];/.exec(body);
66+
var lats_raw;
67+
if(lats_raw_list){
68+
lats_raw = lats_raw_list.pop();
69+
}
70+
71+
var lats;
72+
if( lats_raw ){
73+
lats = JSON.parse(lats_raw);
74+
}
75+
76+
var company = {
77+
"id" : i,
78+
"name" : name,
79+
"company_name" : company_name,
80+
"address" : address,
81+
"logo_url" : logo_url,
82+
"company_url" : company_url,
83+
"job_count": job_count || null,
84+
"employee_count": employee_count || null,
85+
"foundation_date": foundation_date || null,
86+
"about_me": about_me || null,
87+
"lat": (lats)?lats.lat: null,
88+
"lng": (lats)?lats.lng: null
89+
};
90+
output.push( company );
91+
outputFunc();
92+
}
93+
94+
else {
95+
console.log("--- error occcured ---");
96+
if (error && "code" in error) {
97+
console.log("Error No:" + error.errno);
98+
console.log("Error Code:" + error.code);
99+
console.log("Error Syscall:" + error.syscall);
100+
console.log("Status Code:" + response.statusCode);
101+
}
102+
}
103+
});
104+
}
105+
106+
/**
 * Build a delayed crawl step for the company at index `x`.
 *
 * The returned function takes a continuation `func`. It arms a 500 ms timer
 * that, when it fires, writes the counter `i` concatenated with the company
 * id to stderr, starts `crawl` for that id and finally invokes `func`.
 *
 * @param {number} x Index into `company_ids`.
 * @returns {function(function): void} Scheduler accepting the continuation.
 */
function sleep(x) {
  function schedule(func) {
    function fire() {
      var companyId = company_ids[x];
      console.warn(i + companyId);
      crawl(companyId);
      func();
    }
    setTimeout(fire, 500);
  }
  return schedule;
}
115+
116+
function loop(x) {
117+
sleep(x)(function(){
118+
if(x < company_ids.length -1){
119+
loop(x+1);
120+
}
121+
});
122+
}
123+
124+
loop(0);

0 commit comments

Comments
 (0)