
Commit bcd8286

committed
Code cleanup
1 parent ef2bf44 commit bcd8286

File tree: 10 files changed, 354 additions and 423 deletions


Iveely.SearchEngine/Backstage.cs

Lines changed: 3 additions & 2 deletions
@@ -4,6 +4,7 @@
 using System.Linq;
 using System.Net;
 using System.Net.Mime;
+using System.Text;
 using System.Threading;
 using Iveely.CloudComputing.Client;
 using Iveely.Framework.Algorithm;
@@ -70,7 +71,7 @@ public override string ToString()
         /// </summary>
         public static DimensionTable<string, string, double> RelativeTable;

-        public static Framework.Text.HMMSegment segment = HMMSegment.GetInstance();
+        public static HMMSegment segment = HMMSegment.GetInstance();

         public static LocalStore<Template.Question> DataStore;

@@ -328,7 +329,7 @@ public byte[] ProcessQuery(byte[] bytes)
             try
             {
                 Packet packet = Serializer.DeserializeFromBytes<Packet>(bytes);
-                string type = System.Text.Encoding.UTF8.GetString(packet.Data);
+                string type = Encoding.UTF8.GetString(packet.Data);

                 // If this is a text search
                 if (type == "Text-Query")
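
Both Backstage.cs hunks are the same cleanup: with "using System.Text;" added to the imports, the fully qualified System.Text.Encoding prefix (and likewise Framework.Text.HMMSegment) becomes redundant. A minimal sketch of the decode-and-dispatch step this hunk touches; Packet and Serializer are the repository's own types, so a plain byte array stands in for the packet payload here:

    using System;
    using System.Text;

    class QueryTypeDemo
    {
        static void Main()
        {
            // A client would serialize the query type into the packet payload.
            byte[] data = Encoding.UTF8.GetBytes("Text-Query");

            // With "using System.Text;" in scope, no System.Text. prefix is needed.
            string type = Encoding.UTF8.GetString(data);

            // If this is a text search, dispatch accordingly.
            if (type == "Text-Query")
            {
                Console.WriteLine("handle text query");
            }
        }
    }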

Iveely.SearchEngine/BaikeDataCrawler.cs

Lines changed: 20 additions & 28 deletions
@@ -1,24 +1,18 @@
-using System.Text.RegularExpressions;
+using Iveely.CloudComputing.Client;
 using Iveely.Database;
-using Iveely.Framework.NLP;
-using Iveely.Framework.Process;
 using Iveely.Framework.Text;
 using System;
 using System.Collections.Generic;
-using System.Data;
-using System.Data.SqlClient;
 using System.IO;
 using System.Linq;
-using System.Text;
 using System.Threading;
-using System.Threading.Tasks;

 namespace Iveely.SearchEngine
 {
     /// <summary>
     /// Crawler
     /// </summary>
-    public class BaikeDataCrawler : Iveely.CloudComputing.Client.Application
+    public class BaikeDataCrawler : Application
     {
         public class Page
         {
@@ -29,7 +23,7 @@ public class Page
             public string Site;
         }

-        private static object obj = new object();
+        private static readonly object obj = new object();
         public class DataSaver
         {
             public void SavePage(ref List<Page> docs, string folder, bool isForce = false)
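
Marking the lock object readonly is the usual C# hardening: the field can only be assigned at declaration or in a static constructor, so no code path can later swap in a different instance and silently break mutual exclusion. A minimal sketch of the pattern, with illustrative names rather than the repository's:

    using System;
    using System.Threading;

    class SaveCounter
    {
        // readonly: every thread is guaranteed to lock the same instance.
        private static readonly object syncRoot = new object();
        private static int saved;

        public static void RecordSave(string title)
        {
            lock (syncRoot)
            {
                // Critical section: one thread at a time updates shared state.
                saved++;
                Console.WriteLine(title + " -> total " + saved);
            }
        }

        static void Main()
        {
            var t1 = new Thread(() => RecordSave("page A"));
            var t2 = new Thread(() => RecordSave("page B"));
            t1.Start(); t2.Start();
            t1.Join(); t2.Join();
        }
    }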
@@ -87,45 +81,44 @@ public object GetData(string url)
             List<Page> docs = new List<Page>();

             // Links that still need to be crawled
-            List<string> CurrentUrls = new List<string>();
+            List<string> currentUrls = new List<string>();

             // Links that have already been crawled
-            HashSet<string> VisitedUrls = new HashSet<string>();
+            HashSet<string> visitedUrls = new HashSet<string>();


-            string[] urlInfo = url.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
+            string[] urlInfo = url.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
             string schemaUrl = "http://" + urlInfo[0];

             // The Uri conversion may fail
+            int hasVisited = 0;
             try
             {
                 Uri hostUrl = new Uri(schemaUrl);
-                CurrentUrls.Add(schemaUrl);
+                currentUrls.Add(schemaUrl);
                 string site = string.Empty;
-                int hasVisited = 0;
-                int hasUrlsCount = 1;

                 // Crawl while there are still pending links
-                while (CurrentUrls.Count > 0)
+                while (currentUrls.Count > 0)
                 {
                     hasVisited++;
                     HashSet<string> newLinks = new HashSet<string>();
                     try
                     {
                         // 2. Fetch the page content
-                        Console.WriteLine(DateTime.Now.ToString() + "[" + Thread.CurrentThread.ManagedThreadId + "]" + ":Visit " + CurrentUrls[0]);
-                        VisitedUrls.Add(CurrentUrls[0]);
+                        Console.WriteLine(DateTime.Now + "[" + Thread.CurrentThread.ManagedThreadId + "]" + ":Visit " + currentUrls[0]);
+                        visitedUrls.Add(currentUrls[0]);
                         bool isGetContentSuc = false;
-                        Html2Article.ArticleDocument document = Html2Article.GetArticle(CurrentUrls[0], ref isGetContentSuc);
+                        Html2Article.ArticleDocument document = Html2Article.GetArticle(currentUrls[0], ref isGetContentSuc);
                         if (document != null && document.Content.Length > 10)
                         {
                             if (string.IsNullOrEmpty(site))
                             {
-                                string[] titleArray = document.Title.Split(new char[] { '-', '_' }, StringSplitOptions.RemoveEmptyEntries);
+                                string[] titleArray = document.Title.Split(new[] { '-', '_' }, StringSplitOptions.RemoveEmptyEntries);
                                 site = titleArray[titleArray.Length - 1];
                             }
                             Page page = new Page();
-                            page.Url = CurrentUrls[0];
+                            page.Url = currentUrls[0];
                             page.Site = site;
                             page.Content = document.Content;
                             page.Title = document.Title;
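
Renames aside, this loop is a single-host breadth-first crawl: currentUrls is the frontier, visitedUrls prevents revisits, and each iteration dequeues the front URL, fetches it, and enqueues unseen same-host links. A stripped-down sketch of that traversal, with a stub standing in for Html2Article.GetArticle:

    using System;
    using System.Collections.Generic;

    class CrawlLoopSketch
    {
        // Stub standing in for Html2Article's link extraction.
        static IEnumerable<string> ExtractLinks(string url)
        {
            return new string[0];
        }

        static void Crawl(string seed)
        {
            var frontier = new List<string> { seed };    // currentUrls in the diff
            var visited = new HashSet<string> { seed };  // visitedUrls in the diff

            while (frontier.Count > 0)
            {
                string url = frontier[0];
                frontier.RemoveAt(0); // dequeue from the front: breadth-first order
                Console.WriteLine("Visit " + url);

                foreach (string link in ExtractLinks(url))
                {
                    // HashSet.Add returns false for duplicates,
                    // so each link enters the frontier at most once.
                    if (visited.Add(link))
                    {
                        frontier.Add(link);
                    }
                }
            }
        }

        static void Main()
        {
            Crawl("http://example.com");
        }
    }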
@@ -144,19 +137,19 @@ public object GetData(string url)
                             string link = document.ChildrenLink[j];
                             if (link.Contains("#"))
                             {
-                                link = link.Substring(0, link.IndexOf("#", System.StringComparison.Ordinal) - 1);
+                                link = link.Substring(0, link.IndexOf("#", StringComparison.Ordinal) - 1);
                             }
                             if (link.EndsWith("/"))
                             {
                                 link = link.Substring(0, link.Length - 1);
                             }
                             string host = (new Uri(document.ChildrenLink[j])).Host;
                             if (host == hostUrl.Host && !newLinks.Contains(link) &&
-                                !VisitedUrls.Contains(link))
+                                !visitedUrls.Contains(link))
                             {

                                 newLinks.Add(link);
-                                VisitedUrls.Add(link);
+                                visitedUrls.Add(link);
                             }
                         }
                         catch (Exception exception)
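
One thing this cleanup leaves in place: Substring(0, link.IndexOf("#", ...) - 1) trims one character too many, dropping the character just before the '#' along with the fragment. A small standalone check (illustrative only, not part of the commit):

    using System;

    class FragmentTrimCheck
    {
        static void Main()
        {
            string link = "http://host/page#section";

            // As in the diff: the trailing 'e' of "page" is lost as well.
            string asCommitted = link.Substring(0, link.IndexOf("#", StringComparison.Ordinal) - 1);
            Console.WriteLine(asCommitted); // http://host/pag

            // Keeping everything strictly before '#' would drop the -1.
            string exact = link.Substring(0, link.IndexOf("#", StringComparison.Ordinal));
            Console.WriteLine(exact); // http://host/page
        }
    }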
@@ -171,11 +164,10 @@ public object GetData(string url)
                     {
                         Console.WriteLine(exception);
                     }
-                    CurrentUrls.RemoveAt(0);
+                    currentUrls.RemoveAt(0);
                     if (newLinks.Count > 0)
                     {
-                        CurrentUrls.AddRange(newLinks.ToArray());
-                        hasUrlsCount += newLinks.Count;
+                        currentUrls.AddRange(newLinks.ToArray());
                     }
                 }
                 if (docs.Count > 0)
@@ -201,7 +193,7 @@ public void Index()
             ITable<string, Page> table = engine.OpenXTable<string, Page>("WebPage");
             foreach (var kv in table)
             {
-                Page page = (Page)kv.Value;
+                Page page = kv.Value;
                 Console.WriteLine(kv.Key+" "+page.Url);
             }
         }
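
The dropped cast works because OpenXTable<string, Page> returns a generically typed table, so enumeration presumably already yields Page values and kv.Value needs no conversion. The same shape with a plain generic dictionary, as an analogue (not the Iveely.Database API):

    using System;
    using System.Collections.Generic;

    class GenericValueDemo
    {
        static void Main()
        {
            var table = new Dictionary<string, Uri>
            {
                ["WebPage"] = new Uri("http://example.com/page")
            };

            foreach (var kv in table)
            {
                // kv.Value is already statically typed; no (Uri) cast required,
                // just as kv.Value is already a Page for ITable<string, Page>.
                Uri page = kv.Value;
                Console.WriteLine(kv.Key + " " + page);
            }
        }
    }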

0 commit comments
