1- using System . Text . RegularExpressions ;
1+ using Iveely . CloudComputing . Client ;
22using Iveely . Database ;
3- using Iveely . Framework . NLP ;
4- using Iveely . Framework . Process ;
53using Iveely . Framework . Text ;
64using System ;
75using System . Collections . Generic ;
8- using System . Data ;
9- using System . Data . SqlClient ;
106using System . IO ;
117using System . Linq ;
12- using System . Text ;
138using System . Threading ;
14- using System . Threading . Tasks ;
159
1610namespace Iveely . SearchEngine
1711{
1812 /// <summary>
1913 /// Web crawler
2014 /// </summary>
21- public class BaikeDataCrawler : Iveely . CloudComputing . Client . Application
15+ public class BaikeDataCrawler : Application
2216 {
2317 public class Page
2418 {
@@ -29,7 +23,7 @@ public class Page
2923 public string Site ;
3024 }
3125
32- private static object obj = new object ( ) ;
26+ private static readonly object obj = new object ( ) ;
3327 public class DataSaver
3428 {
3529 public void SavePage ( ref List < Page > docs , string folder , bool isForce = false )
@@ -87,45 +81,44 @@ public object GetData(string url)
8781 List < Page > docs = new List < Page > ( ) ;
8882
8983 // Links that still need to be crawled
90- List < string > CurrentUrls = new List < string > ( ) ;
84+ List < string > currentUrls = new List < string > ( ) ;
9185
9286 // Links that have already been crawled
93- HashSet < string > VisitedUrls = new HashSet < string > ( ) ;
87+ HashSet < string > visitedUrls = new HashSet < string > ( ) ;
9488
9589
96- string [ ] urlInfo = url . Split ( new char [ ] { ' ' } , StringSplitOptions . RemoveEmptyEntries ) ;
90+ string [ ] urlInfo = url . Split ( new [ ] { ' ' } , StringSplitOptions . RemoveEmptyEntries ) ;
9791 string schemaUrl = "http://" + urlInfo [ 0 ] ;
9892
9993 // Constructing the Uri may fail
94+ int hasVisited = 0 ;
10095 try
10196 {
10297 Uri hostUrl = new Uri ( schemaUrl ) ;
103- CurrentUrls . Add ( schemaUrl ) ;
98+ currentUrls . Add ( schemaUrl ) ;
10499 string site = string . Empty ;
105- int hasVisited = 0 ;
106- int hasUrlsCount = 1 ;
107100
108101 // Keep crawling while there are pending links
109- while ( CurrentUrls . Count > 0 )
102+ while ( currentUrls . Count > 0 )
110103 {
111104 hasVisited ++ ;
112105 HashSet < string > newLinks = new HashSet < string > ( ) ;
113106 try
114107 {
115108 //2. Fetch the web page information
116- Console . WriteLine ( DateTime . Now . ToString ( ) + "[" + Thread . CurrentThread . ManagedThreadId + "]" + ":Visit " + CurrentUrls [ 0 ] ) ;
117- VisitedUrls . Add ( CurrentUrls [ 0 ] ) ;
109+ Console . WriteLine ( DateTime . Now + "[" + Thread . CurrentThread . ManagedThreadId + "]" + ":Visit " + currentUrls [ 0 ] ) ;
110+ visitedUrls . Add ( currentUrls [ 0 ] ) ;
118111 bool isGetContentSuc = false ;
119- Html2Article . ArticleDocument document = Html2Article . GetArticle ( CurrentUrls [ 0 ] , ref isGetContentSuc ) ;
112+ Html2Article . ArticleDocument document = Html2Article . GetArticle ( currentUrls [ 0 ] , ref isGetContentSuc ) ;
120113 if ( document != null && document . Content . Length > 10 )
121114 {
122115 if ( string . IsNullOrEmpty ( site ) )
123116 {
124- string [ ] titleArray = document . Title . Split ( new char [ ] { '-' , '_' } , StringSplitOptions . RemoveEmptyEntries ) ;
117+ string [ ] titleArray = document . Title . Split ( new [ ] { '-' , '_' } , StringSplitOptions . RemoveEmptyEntries ) ;
125118 site = titleArray [ titleArray . Length - 1 ] ;
126119 }
127120 Page page = new Page ( ) ;
128- page . Url = CurrentUrls [ 0 ] ;
121+ page . Url = currentUrls [ 0 ] ;
129122 page . Site = site ;
130123 page . Content = document . Content ;
131124 page . Title = document . Title ;
@@ -144,19 +137,19 @@ public object GetData(string url)
144137 string link = document . ChildrenLink [ j ] ;
145138 if ( link . Contains ( "#" ) )
146139 {
147- link = link . Substring ( 0 , link . IndexOf ( "#" , System . StringComparison . Ordinal ) - 1 ) ;
140+ link = link . Substring ( 0 , link . IndexOf ( "#" , StringComparison . Ordinal ) - 1 ) ;
148141 }
149142 if ( link . EndsWith ( "/" ) )
150143 {
151144 link = link . Substring ( 0 , link . Length - 1 ) ;
152145 }
153146 string host = ( new Uri ( document . ChildrenLink [ j ] ) ) . Host ;
154147 if ( host == hostUrl . Host && ! newLinks . Contains ( link ) &&
155- ! VisitedUrls . Contains ( link ) )
148+ ! visitedUrls . Contains ( link ) )
156149 {
157150
158151 newLinks . Add ( link ) ;
159- VisitedUrls . Add ( link ) ;
152+ visitedUrls . Add ( link ) ;
160153 }
161154 }
162155 catch ( Exception exception )
@@ -171,11 +164,10 @@ public object GetData(string url)
171164 {
172165 Console . WriteLine ( exception ) ;
173166 }
174- CurrentUrls . RemoveAt ( 0 ) ;
167+ currentUrls . RemoveAt ( 0 ) ;
175168 if ( newLinks . Count > 0 )
176169 {
177- CurrentUrls . AddRange ( newLinks . ToArray ( ) ) ;
178- hasUrlsCount += newLinks . Count ;
170+ currentUrls . AddRange ( newLinks . ToArray ( ) ) ;
179171 }
180172 }
181173 if ( docs . Count > 0 )
@@ -201,7 +193,7 @@ public void Index()
201193 ITable < string , Page > table = engine . OpenXTable < string , Page > ( "WebPage" ) ;
202194 foreach ( var kv in table )
203195 {
204- Page page = ( Page ) kv . Value ;
196+ Page page = kv . Value ;
205197 Console . WriteLine ( kv . Key + " " + page . Url ) ;
206198 }
207199 }
0 commit comments