|
| 1 | +# NScrapy Legacy API / NScrapy 旧版 API |
| 2 | + |
| 3 | +> ⚠️ 以下为旧版类继承 API,已不再推荐使用。新项目请使用 [Fluent API](../README.md#编程指南--programming-guide)。 |
| 4 | +> The following is the legacy class-inheritance API, no longer recommended for new projects. Please use the [Fluent API](../README.md#编程指南--programming-guide) for new projects. |
| 5 | +
|
| 6 | +--- |
| 7 | + |
| 8 | +## NScrapy Sample Code / NScrapy 示例代码 |
| 9 | + |
| 10 | +Below is a sample of NScrapy. The sample will visit Liepin, which is a recruitment website. 
| 11 | +Based on the seed URL defined in the `[URL]` attribute, NScrapy will visit each position's detail page (the `ParseItem` method) and move on to the next page automatically (the `VisitPage` method). 
| 12 | +The Spider writer does not need to know how Spiders distributed across different machines/processes communicate with each other, or how the Downloader process gets the URLs that need to be downloaded. Just give NScrapy the seed URL, inherit the `Spider.Spider` class, and write a few callbacks — NScrapy will take care of the rest. 
| 13 | +NScrapy supports several kinds of extensions, including adding your own DownloaderMiddleware, configuring HTTP headers, and building a user agent pool. 
| 14 | + |
| 15 | +如下是一段简单的 NScrapy 爬虫,该爬虫会抓取猎聘网上所有 PHP 的职位信息并做相应的输出。 |
| 16 | +基于定义在 `[URL]` attribute 中的种子 URL,NScrapy 会访问每一个职位信息的详细信息页面(`ParseItem` method),并且自动爬取下一页信息(`VisitPage` method)。 |
| 17 | +爬虫作者不需要关心如何管理分布式爬虫之间如何互相通信,下载器如何获取待下载队列,下载器池是如何维护的,仅仅需要告诉 NScrapy 一个种子链接,继承 `Spider.Spider` 类,并完成默认回调函数就可以爬取信息。 |
| 18 | +NScrapy 支持丰富的自定义扩展,包括在配置文件 `appsetting.json` 中加入 `DownloaderMiddleware`、配置 Http 请求头、构造 User Agent pool 等。 
| 19 | + |
| 20 | +### Usage / 使用方法 |
| 21 | + |
| 22 | +```csharp |
| 23 | +using NScrapy.Infra; |
| 24 | +using NScrapy.Infra.Attributes.SpiderAttributes; |
| 25 | +using System; |
| 26 | +using System.Collections.Generic; |
| 27 | +using System.IO; |
| 28 | +using System.Linq; |
| 29 | +using System.Text; |
| 30 | +using System.Threading; |
| 31 | + |
| 32 | +namespace NScrapy.Project |
| 33 | +{ |
| 34 | + class Program |
| 35 | + { |
| 36 | + static void Main(string[] args) |
| 37 | + { |
| 38 | + // Init shell of NScrapy, which will init the context of NScrapy |
| 39 | + var shell = NScrapy.Shell.NScrapy.GetInstance(); |
| 40 | + // Specify the Spider that you want to start |
| 41 | + shell.Crawl("JobSpider"); |
| 42 | + return; |
| 43 | + } |
| 44 | + } |
| 45 | + |
| 46 | + [Name(Name = "JobSpider")] |
| 47 | + [URL("https://www.liepin.com/zhaopin/?industries=&dqs=&salary=&jobKind=&pubTime=30&compkind=&compscale=&industryType=&searchType=1&clean_condition=&isAnalysis=&init=1&sortFlag=15&flushckid=0&fromSearchBtn=1&headckid=bb314f611fde073c&d_headId=4b294eff4ad202db83d4ed085fcbf94b&d_ckId=01fb643c53d14dd44d7991e27c98c51b&d_sfrom=search_prime&d_curPage=0&d_pageSize=40&siTag=k_cloHQj_hyIn0SLM9IfRg~UoKQA1_uiNxxEb8RglVcHg&key=php")] |
| 48 | + public class JobSpider : Spider.Spider |
| 49 | + { |
| 50 | + private string startingTime = DateTime.Now.ToString("yyyyMMddhhmm"); |
| 51 | + |
| 52 | + public JobSpider() |
| 53 | + { |
| 54 | + } |
| 55 | + |
| 56 | + // 爬取种子链接 / Crawl seed URL |
| 57 | + public override void ResponseHandler(IResponse response) |
| 58 | + { |
| 59 | + var httpResponse = response as HttpResponse; |
| 60 | + var returnValue = response.CssSelector(".job-info h3 a::attr(href)"); |
| 61 | + var pages = response.CssSelector(".pagerbar a::attr(href)").Extract(); |
| 62 | + |
| 63 | + foreach (var page in pages) |
| 64 | + { |
| 65 | + if (!page.Contains("javascript")) |
| 66 | + { |
| 67 | + NScrapy.Shell.NScrapy.GetInstance().Follow(returnValue, page, VisitPage); |
| 68 | + } |
| 69 | + } |
| 70 | + VisitPage(returnValue); |
| 71 | + } |
| 72 | + |
| 73 | + // 翻页 / Visit next page |
| 74 | + private void VisitPage(IResponse returnValue) |
| 75 | + { |
| 76 | + var hrefs = returnValue.CssSelector(".job-info h3 a::attr(href)").Extract(); |
| 77 | + |
| 78 | + foreach (var href in hrefs) |
| 79 | + { |
| 80 | + // Use ItemLoader / 使用 ItemLoader |
| 81 | + NScrapy.Shell.NScrapy.GetInstance().Follow(returnValue, href, ParseItem); |
| 82 | + } |
| 83 | + |
| 84 | + var pages = returnValue.CssSelector(".pagerbar a::attr(href)").Extract(); |
| 85 | + foreach (var page in pages) |
| 86 | + { |
| 87 | + if (!page.Contains("javascript")) |
| 88 | + { |
| 89 | + NScrapy.Shell.NScrapy.GetInstance().Follow(returnValue, page, VisitPage); |
| 90 | + } |
| 91 | + } |
| 92 | + } |
| 93 | + |
| 94 | + // 在具体岗位的招聘页面上获取信息 / Get info from job detail page |
| 95 | + public void ParseItem(IResponse response) |
| 96 | + { |
| 97 | + // Add Field Mapping to the HTML Dom element / 添加字段映射到 HTML DOM 元素 |
| 98 | + var itemLoader = new ItemLoader<JobItem>(response); |
| 99 | + itemLoader.AddFieldMapping("Title", "css:.title-info h1::attr(text)"); |
| 100 | + itemLoader.AddFieldMapping("Title", "css:.job-title h1::attr(text)"); |
| 101 | + |
| 102 | + itemLoader.AddFieldMapping("Firm", "css:.title-info h3 a::attr(text)"); |
| 103 | + itemLoader.AddFieldMapping("Firm", "css:.title-info h3::attr(text)"); |
| 104 | + itemLoader.AddFieldMapping("Firm", "css:.title-info h3"); |
| 105 | + itemLoader.AddFieldMapping("Firm", "css:.job-title h2::attr(text)"); |
| 106 | + |
| 107 | + itemLoader.AddFieldMapping("Salary", "css:.job-main-title p::attr(text)"); |
| 108 | + itemLoader.AddFieldMapping("Salary", "css:.job-main-title strong::attr(text)"); |
| 109 | + itemLoader.AddFieldMapping("Salary", "css:.job-item-title p::attr(text)"); |
| 110 | + itemLoader.AddFieldMapping("Salary", "css:.job-item-title"); |
| 111 | + |
| 112 | + itemLoader.AddFieldMapping("Time", "css:.job-title-left time::attr(title)"); |
| 113 | + itemLoader.AddFieldMapping("Time", "css:.job-title-left time::attr(text)"); |
| 114 | + |
| 115 | + var item = itemLoader.LoadItem(); |
| 116 | + |
| 117 | + // 在示例中简单输出职位公司信息到控制台,你可以将信息输出到任何其他地方 |
| 118 | + // Simple write the Position Firm information at the console, |
| 119 | + // you can write the information to anywhere else |
| 120 | + Console.WriteLine(item.Firm); |
| 121 | + } |
| 122 | + } |
| 123 | + |
| 124 | + public class JobItem |
| 125 | + { |
| 126 | + public string Firm { get; set; } |
| 127 | + public string Title { get; set; } |
| 128 | + public string Salary { get; set; } |
| 129 | + public string Time { get; set; } |
| 130 | + } |
| 131 | +} |
| 132 | +``` |
| 133 | + |
| 134 | +--- |
| 135 | + |
| 136 | +## 分布式运行 / Distributed NScrapy |
| 137 | + |
| 138 | +### 修改 Spider 项目中的 appsettings.json / Modify appsettings.json in Spider Project |
| 139 | + |
| 140 | +```json |
| 141 | +{ |
| 142 | + "Scheduler": { |
| 143 | + "SchedulerType": "NScrapy.Scheduler.RedisExt.RedisScheduler" |
| 144 | + }, |
| 145 | + "Scheduler.RedisExt": { |
| 146 | + "RedisServer": "192.168.0.106", |
| 147 | + "RedisPort": "6379", |
| 148 | + "ReceiverQueue": "NScrapy.Downloader", |
| 149 | + "ResponseQueue": "NScrapy.ResponseQueue" |
| 150 | + } |
| 151 | +} |
| 152 | +``` |
| 153 | + |
| 154 | +### 修改 Downloader 项目中的 appsettings.json / Modify appsettings.json in Downloader Project |
| 155 | + |
| 156 | +```json |
| 157 | +{ |
| 158 | + "Scheduler": { |
| 159 | + "SchedulerType": "NScrapy.Scheduler.RedisExt.RedisScheduler" |
| 160 | + }, |
| 161 | + "Scheduler.RedisExt": { |
| 162 | + "RedisServer": "192.168.0.106", |
| 163 | + "RedisPort": "6379", |
| 164 | + "ReceiverQueue": "NScrapy.Downloader", |
| 165 | + "ResponseQueue": "NScrapy.ResponseQueue" |
| 166 | + } |
| 167 | +} |
| 168 | +``` |
| 169 | + |
| 170 | +### 单独运行 DownloaderShell / Run DownloaderShell Individually |
| 171 | + |
| 172 | +```bash |
| 173 | +dotnet /path/to/NScrapy.DownloaderShell.dll |
| 174 | +``` |
| 175 | + |
| 176 | +### 状态更新中间件 / Status Updater Middleware |
| 177 | + |
| 178 | +如果需要将 Downloader 状态更新到 Redis,可以添加下面的中间件到 appsettings.json: |
| 179 | +If you want to publish the Downloader status to Redis, add the middleware below to appsettings.json: 
| 180 | + |
| 181 | +```json |
| 182 | +"DownloaderMiddlewares": [ |
| 183 | + { "Middleware": "NScrapy.DownloaderShell.StatusUpdaterMiddleware" } |
| 184 | +] |
| 185 | +``` |
| 186 | + |
| 187 | +> 💡 [NScrapyWebConsole](https://github.com/xboxeer/NScrapyWebConsole) 会从 Redis 中读取 Downloader 状态数据。 |
| 188 | +> [NScrapyWebConsole](https://github.com/xboxeer/NScrapyWebConsole) will read Downloader status from Redis. |
| 189 | +
|
| 190 | +--- |
| 191 | + |
| 192 | +### MongoDB Pipeline(旧版 / Legacy) |
| 193 | + |
| 194 | +如果需要将抓取到的内容添加到 MongoDB 中,可以创建如下 PipelineItem: |
| 195 | +If you want to add the data that you captured to a MongoDB database, you can add the PipelineItem below: 
| 196 | + |
| 197 | +```csharp |
| 198 | +public class MongoItemPipeline : IPipeline<JobItem> |
| 199 | +{ |
| 200 | + private MongoClient client = new MongoClient("mongodb://localhost:27017"); |
| 201 | + |
| 202 | + public async void ProcessItem(JobItem item, ISpider spider) |
| 203 | + { |
| 204 | + var db = client.GetDatabase("NScrapy"); |
| 205 | + var collection = db.GetCollection<JobItem>("JobItem"); |
| 206 | + await collection.InsertOneAsync(item); |
| 207 | + } |
| 208 | +} |
| 209 | +``` |
| 210 | + |
| 211 | +添加到 `appsettings.json`: |
| 212 | +Add the Pipeline to your project's `appsettings.json`: |
| 213 | + |
| 214 | +```json |
| 215 | +"Pipelines": [ |
| 216 | + { "Pipeline": "NScrapy.Project.MongoItemPipeline" } |
| 217 | +] |
| 218 | +``` |
| 219 | + |
| 220 | +--- |
| 221 | + |
| 222 | +### CSV Pipeline(旧版 / Legacy) |
| 223 | + |
| 224 | +如果想要存储到 CSV 文件中,也可以添加 CSV pipeline: |
| 225 | +You can also save your data as CSV by adding a CSV pipeline: 
| 226 | + |
| 227 | +```csharp |
| 228 | +public class CSVItemPipeline : IPipeline<JobItem> |
| 229 | +{ |
| 230 | + private string startTime = DateTime.Now.ToString("yyyyMMddhhmm"); |
| 231 | + |
| 232 | + public void ProcessItem(JobItem item, ISpider spider) |
| 233 | + { |
| 234 | + var info = $"{item.Title},{item.Firm},{item.SalaryFrom},{item.SalaryTo},{item.Location},{item.Time},{item.URL},{System.Environment.NewLine}"; |
| 235 | + Console.WriteLine(info); |
| 236 | + File.AppendAllText($"output-{startTime}.csv", info, Encoding.UTF8); |
| 237 | + } |
| 238 | +} |
| 239 | +``` |
| 240 | + |
| 241 | +添加到 `appsettings.json`: |
| 242 | +Add the pipeline item in `appsettings.json`: |
| 243 | + |
| 244 | +```json |
| 245 | +"Pipelines": [ |
| 246 | + { "Pipeline": "NScrapy.Project.MongoItemPipeline" }, |
| 247 | + { "Pipeline": "NScrapy.Project.CSVItemPipeline" } |
| 248 | +] |
| 249 | +``` |
0 commit comments