Skip to content

Commit 9ed01f4

Browse files
authored
Merge pull request #70 from xboxeer/feature/readme-restructuring
Restructure README: Fluent API first, Docker deployment, legacy in se…
2 parents 8d5bc24 + f2e89a3 commit 9ed01f4

File tree

2 files changed

+497
-339
lines changed

2 files changed

+497
-339
lines changed

LEGACY.md

Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
# NScrapy Legacy API / NScrapy 旧版 API
2+
3+
> ⚠️ 以下为旧版类继承 API,已不再推荐使用。新项目请使用 [Fluent API](../README.md#编程指南--programming-guide)
4+
> The following is the legacy class-inheritance API, no longer recommended for new projects. Please use the [Fluent API](../README.md#编程指南--programming-guide) for new projects.
5+
6+
---
7+
8+
## NScrapy Sample Code / NScrapy 示例代码
9+
10+
Below is a sample of NScrapy. The sample will visit Liepin, which is a recruitment website.
11+
Based on the seed URL defined in the `[URL]` attribute, NScrapy will visit each position's detail page (the `ParseItem` method) and move on to the next page automatically (the `VisitPage` method).
12+
It is not necessary for the Spider writer to know how the Spiders distributed across different machines/processes communicate with each other, or how the Downloader process gets the URLs that need to be downloaded. Just give NScrapy the seed URL, inherit from the `Spider.Spider` class, and write some callbacks; NScrapy will take care of the rest.
13+
NScrapy supports different kinds of extensions, including adding your own DownloaderMiddleware, configuring HTTP headers, and building a user agent pool.
14+
15+
如下是一段简单的 NScrapy 爬虫,该爬虫会抓取猎聘网上所有 PHP 的职位信息并做相应的输出。
16+
基于定义在 `[URL]` attribute 中的种子 URL,NScrapy 会访问每一个职位信息的详细信息页面(`ParseItem` method),并且自动爬取下一页信息(`VisitPage` method)。
17+
爬虫作者不需要关心如何管理分布式爬虫之间如何互相通信,下载器如何获取待下载队列,下载器池是如何维护的,仅仅需要告诉 NScrapy 一个种子链接,继承 `Spider.Spider` 类,并完成默认回调函数就可以爬取信息。
18+
NScrapy 支持丰富的自定义扩展,包括在配置文件 `appsetting.json` 中加入 `DownloaderMiddware`、配置 Http 请求头、构造 User Agent pool 等。
19+
20+
### Usage / 使用方法
21+
22+
```csharp
23+
using NScrapy.Infra;
24+
using NScrapy.Infra.Attributes.SpiderAttributes;
25+
using System;
26+
using System.Collections.Generic;
27+
using System.IO;
28+
using System.Linq;
29+
using System.Text;
30+
using System.Threading;
31+
32+
namespace NScrapy.Project
33+
{
34+
class Program
35+
{
36+
static void Main(string[] args)
37+
{
38+
// Init shell of NScrapy, which will init the context of NScrapy
39+
var shell = NScrapy.Shell.NScrapy.GetInstance();
40+
// Specify the Spider that you want to start
41+
shell.Crawl("JobSpider");
42+
return;
43+
}
44+
}
45+
46+
[Name(Name = "JobSpider")]
47+
[URL("https://www.liepin.com/zhaopin/?industries=&dqs=&salary=&jobKind=&pubTime=30&compkind=&compscale=&industryType=&searchType=1&clean_condition=&isAnalysis=&init=1&sortFlag=15&flushckid=0&fromSearchBtn=1&headckid=bb314f611fde073c&d_headId=4b294eff4ad202db83d4ed085fcbf94b&d_ckId=01fb643c53d14dd44d7991e27c98c51b&d_sfrom=search_prime&d_curPage=0&d_pageSize=40&siTag=k_cloHQj_hyIn0SLM9IfRg~UoKQA1_uiNxxEb8RglVcHg&key=php")]
48+
public class JobSpider : Spider.Spider
49+
{
50+
private string startingTime = DateTime.Now.ToString("yyyyMMddhhmm");
51+
52+
public JobSpider()
53+
{
54+
}
55+
56+
// 爬取种子链接 / Crawl seed URL
57+
public override void ResponseHandler(IResponse response)
58+
{
59+
var httpResponse = response as HttpResponse;
60+
var returnValue = response.CssSelector(".job-info h3 a::attr(href)");
61+
var pages = response.CssSelector(".pagerbar a::attr(href)").Extract();
62+
63+
foreach (var page in pages)
64+
{
65+
if (!page.Contains("javascript"))
66+
{
67+
NScrapy.Shell.NScrapy.GetInstance().Follow(returnValue, page, VisitPage);
68+
}
69+
}
70+
VisitPage(returnValue);
71+
}
72+
73+
// 翻页 / Visit next page
74+
private void VisitPage(IResponse returnValue)
75+
{
76+
var hrefs = returnValue.CssSelector(".job-info h3 a::attr(href)").Extract();
77+
78+
foreach (var href in hrefs)
79+
{
80+
// Use ItemLoader / 使用 ItemLoader
81+
NScrapy.Shell.NScrapy.GetInstance().Follow(returnValue, href, ParseItem);
82+
}
83+
84+
var pages = returnValue.CssSelector(".pagerbar a::attr(href)").Extract();
85+
foreach (var page in pages)
86+
{
87+
if (!page.Contains("javascript"))
88+
{
89+
NScrapy.Shell.NScrapy.GetInstance().Follow(returnValue, page, VisitPage);
90+
}
91+
}
92+
}
93+
94+
// 在具体岗位的招聘页面上获取信息 / Get info from job detail page
95+
public void ParseItem(IResponse response)
96+
{
97+
// Add Field Mapping to the HTML Dom element / 添加字段映射到 HTML DOM 元素
98+
var itemLoader = new ItemLoader<JobItem>(response);
99+
itemLoader.AddFieldMapping("Title", "css:.title-info h1::attr(text)");
100+
itemLoader.AddFieldMapping("Title", "css:.job-title h1::attr(text)");
101+
102+
itemLoader.AddFieldMapping("Firm", "css:.title-info h3 a::attr(text)");
103+
itemLoader.AddFieldMapping("Firm", "css:.title-info h3::attr(text)");
104+
itemLoader.AddFieldMapping("Firm", "css:.title-info h3");
105+
itemLoader.AddFieldMapping("Firm", "css:.job-title h2::attr(text)");
106+
107+
itemLoader.AddFieldMapping("Salary", "css:.job-main-title p::attr(text)");
108+
itemLoader.AddFieldMapping("Salary", "css:.job-main-title strong::attr(text)");
109+
itemLoader.AddFieldMapping("Salary", "css:.job-item-title p::attr(text)");
110+
itemLoader.AddFieldMapping("Salary", "css:.job-item-title");
111+
112+
itemLoader.AddFieldMapping("Time", "css:.job-title-left time::attr(title)");
113+
itemLoader.AddFieldMapping("Time", "css:.job-title-left time::attr(text)");
114+
115+
var item = itemLoader.LoadItem();
116+
117+
// 在示例中简单输出职位公司信息到控制台,你可以将信息输出到任何其他地方
118+
// Simply write the position's firm information to the console;
119+
// you can write the information to anywhere else
120+
Console.WriteLine(item.Firm);
121+
}
122+
}
123+
124+
public class JobItem
125+
{
126+
public string Firm { get; set; }
127+
public string Title { get; set; }
128+
public string Salary { get; set; }
129+
public string Time { get; set; }
130+
}
131+
}
132+
```
133+
134+
---
135+
136+
## 分布式运行 / Distributed NScrapy
137+
138+
### 修改 Spider 项目中的 appsettings.json / Modify appsettings.json in Spider Project
139+
140+
```json
141+
{
142+
"Scheduler": {
143+
"SchedulerType": "NScrapy.Scheduler.RedisExt.RedisScheduler"
144+
},
145+
"Scheduler.RedisExt": {
146+
"RedisServer": "192.168.0.106",
147+
"RedisPort": "6379",
148+
"ReceiverQueue": "NScrapy.Downloader",
149+
"ResponseQueue": "NScrapy.ResponseQueue"
150+
}
151+
}
152+
```
153+
154+
### 修改 Downloader 项目中的 appsettings.json / Modify appsettings.json in Downloader Project
155+
156+
```json
157+
{
158+
"Scheduler": {
159+
"SchedulerType": "NScrapy.Scheduler.RedisExt.RedisScheduler"
160+
},
161+
"Scheduler.RedisExt": {
162+
"RedisServer": "192.168.0.106",
163+
"RedisPort": "6379",
164+
"ReceiverQueue": "NScrapy.Downloader",
165+
"ResponseQueue": "NScrapy.ResponseQueue"
166+
}
167+
}
168+
```
169+
170+
### 单独运行 DownloaderShell / Run DownloaderShell Individually
171+
172+
```bash
173+
dotnet /path/to/NScrapy.DownloaderShell.dll
174+
```
175+
176+
### 状态更新中间件 / Status Updater Middleware
177+
178+
如果需要将 Downloader 状态更新到 Redis,可以添加下面的中间件到 appsettings.json:
179+
If you want to update Downloader status to Redis, add the below middleware to appsettings.json:
180+
181+
```json
182+
"DownloaderMiddlewares": [
183+
{ "Middleware": "NScrapy.DownloaderShell.StatusUpdaterMiddleware" }
184+
]
185+
```
186+
187+
> 💡 [NScrapyWebConsole](https://github.com/xboxeer/NScrapyWebConsole) 会从 Redis 中读取 Downloader 状态数据。
188+
> [NScrapyWebConsole](https://github.com/xboxeer/NScrapyWebConsole) will read Downloader status from Redis.
189+
190+
---
191+
192+
### MongoDB Pipeline(旧版 / Legacy)
193+
194+
如果需要将抓取到的内容添加到 MongoDB 中,可以创建如下 PipelineItem:
195+
If you want to add the data that you captured to MongoDB, you can create the PipelineItem below:
196+
197+
```csharp
198+
public class MongoItemPipeline : IPipeline<JobItem>
199+
{
200+
private MongoClient client = new MongoClient("mongodb://localhost:27017");
201+
202+
public async void ProcessItem(JobItem item, ISpider spider)
203+
{
204+
var db = client.GetDatabase("NScrapy");
205+
var collection = db.GetCollection<JobItem>("JobItem");
206+
await collection.InsertOneAsync(item);
207+
}
208+
}
209+
```
210+
211+
添加到 `appsettings.json`
212+
Add the Pipeline to your project's `appsettings.json`:
213+
214+
```json
215+
"Pipelines": [
216+
{ "Pipeline": "NScrapy.Project.MongoItemPipeline" }
217+
]
218+
```
219+
220+
---
221+
222+
### CSV Pipeline(旧版 / Legacy)
223+
224+
如果想要存储到 CSV 文件中,也可以添加 CSV pipeline:
225+
You can also save your data as CSV by adding a CSV pipeline:
226+
227+
```csharp
228+
public class CSVItemPipeline : IPipeline<JobItem>
229+
{
230+
private string startTime = DateTime.Now.ToString("yyyyMMddhhmm");
231+
232+
public void ProcessItem(JobItem item, ISpider spider)
233+
{
234+
var info = $"{item.Title},{item.Firm},{item.Salary},{item.Time}{System.Environment.NewLine}";
235+
Console.WriteLine(info);
236+
File.AppendAllText($"output-{startTime}.csv", info, Encoding.UTF8);
237+
}
238+
}
239+
```
240+
241+
添加到 `appsettings.json`
242+
Add the pipeline item in `appsettings.json`:
243+
244+
```json
245+
"Pipelines": [
246+
{ "Pipeline": "NScrapy.Project.MongoItemPipeline" },
247+
{ "Pipeline": "NScrapy.Project.CSVItemPipeline" }
248+
]
249+
```

0 commit comments

Comments
 (0)