@@ -372,27 +372,25 @@ public MacroscopeDataExtractorXpaths GetDataExtractorXpaths ()
372
372
public bool Execute ( )
373
373
{
374
374
375
- this . DebugMsg ( string . Format ( "Start URL: {0}" , this . StartUrl ) ) ;
375
+ this . DebugMsg ( string . Format ( "Start URL: {0}" , this . GetStartUrl ( ) ) ) ;
376
376
377
377
//this.LogEntry( string.Format( "Executing with Start URL: {0}", this.StartUrl ) );
378
378
379
- this . StartUrl = MacroscopeHttpUrlUtils . SanitizeUrl ( Url : this . StartUrl ) ;
379
+ this . SetStartUrl ( Url : MacroscopeHttpUrlUtils . SanitizeUrl ( Url : this . GetStartUrl ( ) ) ) ;
380
380
381
- this . DocCollection . SetStartUrl ( Url : this . StartUrl ) ;
382
-
383
- this . DetermineStartingDirectory ( ) ;
381
+ this . DocCollection . SetStartUrl ( Url : this . GetStartUrl ( ) ) ;
384
382
385
383
this . SetThreadsStop ( Stopped : false ) ;
386
384
387
- this . AllowedHosts . AddFromUrl ( Url : this . StartUrl ) ;
385
+ this . AllowedHosts . AddFromUrl ( Url : this . GetStartUrl ( ) ) ;
388
386
389
387
if ( ! this . PeekUrlQueue ( ) )
390
388
{
391
389
392
390
{ // Add robots.txt URL to queue
393
391
if ( MacroscopePreferencesManager . GetFollowRobotsProtocol ( ) )
394
392
{
395
- string RobotsUrl = MacroscopeRobots . GenerateRobotUrl ( Url : this . StartUrl ) ;
393
+ string RobotsUrl = MacroscopeRobots . GenerateRobotUrl ( Url : this . GetStartUrl ( ) ) ;
396
394
if ( ! string . IsNullOrEmpty ( RobotsUrl ) )
397
395
{
398
396
this . AddUrlQueueItem ( Url : RobotsUrl ) ;
@@ -406,7 +404,7 @@ public bool Execute ()
406
404
MacroscopeSitemapPaths SitemapPaths = new MacroscopeSitemapPaths ( ) ;
407
405
foreach ( string SitemapPath in SitemapPaths . IterateSitemapPaths ( ) )
408
406
{
409
- string SitemapUrl = MacroscopeSitemapPaths . GenerateSitemapUrl ( Url : this . StartUrl , SitemapPath : SitemapPath ) ;
407
+ string SitemapUrl = MacroscopeSitemapPaths . GenerateSitemapUrl ( Url : this . GetStartUrl ( ) , SitemapPath : SitemapPath ) ;
410
408
if ( ! string . IsNullOrEmpty ( SitemapUrl ) )
411
409
{
412
410
this . AddUrlQueueItem ( Url : SitemapUrl ) ;
@@ -418,17 +416,17 @@ public bool Execute ()
418
416
{ // Add humans.txt URL to queue
419
417
if ( MacroscopePreferencesManager . GetProbeHumansText ( ) )
420
418
{
421
- string HumansUrl = MacroscopeHumans . GenerateHumansUrl ( Url : this . StartUrl ) ;
419
+ string HumansUrl = MacroscopeHumans . GenerateHumansUrl ( Url : this . GetStartUrl ( ) ) ;
422
420
if ( ! string . IsNullOrEmpty ( HumansUrl ) )
423
421
{
424
422
this . AddUrlQueueItem ( Url : HumansUrl ) ;
425
423
}
426
424
}
427
425
}
428
426
429
- this . IncludeExcludeUrls . AddExplicitIncludeUrl ( Url : this . StartUrl ) ;
427
+ this . IncludeExcludeUrls . AddExplicitIncludeUrl ( Url : this . GetStartUrl ( ) ) ;
430
428
431
- this . AddUrlQueueItem ( Url : this . StartUrl ) ;
429
+ this . AddUrlQueueItem ( Url : this . GetStartUrl ( ) ) ;
432
430
433
431
foreach ( MacroscopeDocument msDoc in this . GetDocCollection ( ) . IterateDocuments ( ) )
434
432
{
@@ -437,9 +435,9 @@ public bool Execute ()
437
435
438
436
}
439
437
440
- this . ProbeRobotsFile ( Url : this . StartUrl ) ;
438
+ this . ProbeRobotsFile ( Url : this . GetStartUrl ( ) ) ;
441
439
442
- this . SetCrawlDelay ( Url : this . StartUrl ) ;
440
+ this . SetCrawlDelay ( Url : this . GetStartUrl ( ) ) ;
443
441
444
442
this . SpawnWorkers ( ) ;
445
443
@@ -450,7 +448,7 @@ public bool Execute ()
450
448
this . TaskController . ICallbackScanComplete ( ) ;
451
449
}
452
450
453
- this . AddUpdateDisplayQueue ( Url : this . StartUrl ) ;
451
+ this . AddUpdateDisplayQueue ( Url : this . GetStartUrl ( ) ) ;
454
452
455
453
return ( true ) ;
456
454
@@ -1089,6 +1087,7 @@ private void ResetLink ( MacroscopeDocument msDoc )
1089
1087
// Stores Url as this object's start URL and returns the stored value.
// The "+" line in this hunk adds a call to DetermineStartingDirectory() right
// after the assignment, so the starting directories are set every time the
// start URL is assigned through this setter.
public string SetStartUrl ( string Url )
1090
1088
{
1091
1089
this . StartUrl = Url ;
1090
+ this . DetermineStartingDirectory ( ) ;
1092
1091
return ( this . StartUrl ) ;
1093
1092
}
1094
1093
@@ -1103,7 +1102,7 @@ public string GetStartUrl ()
1103
1102
1104
1103
public string GetStartUriHostAndPort ( )
1105
1104
{
1106
- Uri StartUri = new Uri ( this . StartUrl ) ;
1105
+ Uri StartUri = new Uri ( this . GetStartUrl ( ) ) ;
1107
1106
string StartUriHostAndPort = null ;
1108
1107
if ( StartUri != null )
1109
1108
{
@@ -1148,70 +1147,11 @@ private void IncPagesFound ()
1148
1147
1149
1148
/** Crawl Parent / Child Directories **************************************/
1150
1149
1151
// Sets both the parent and the child starting directory from the current start URL.
// This hunk narrows the method from public to private and replaces the inline
// Uri parsing (the "-" lines) with one call to
// MacroscopeHttpUrlUtils.DetermineStartingDirectory(); its result is passed to
// both SetParentStartingDirectory() and SetChildStartingDirectory().
// NOTE(review): the helper's body is not visible here - confirm that it keeps the
// removed code's behaviour for malformed URLs (see the comments below).
- public void DetermineStartingDirectory ( )
1150
+ private void DetermineStartingDirectory ( )
1152
1151
{
1153
-
1154
- Uri StartUri = null ;
1155
- string Path = "/" ;
1156
- string StartUriPort = "" ;
1157
-
1158
- try
1159
- {
1160
-
1161
- StartUri = new Uri ( this . GetStartUrl ( ) ) ;
1162
-
1163
- if ( StartUri . Port > 0 )
1164
- {
1165
- StartUriPort = string . Format ( ":{0}" , StartUri . Port ) ;
1166
- }
1167
-
1168
- Path = StartUri . AbsolutePath ;
1169
-
1170
- }
// Removed code: both handlers only emit a debug message. If the Uri constructor
// throws, StartUri stays null and the block below is skipped, so neither
// starting directory is set.
1171
- catch ( UriFormatException ex )
1172
- {
1173
- this . DebugMsg ( string . Format ( "DetermineStartingDirectory: {0}" , ex . Message ) ) ;
1174
- }
1175
- catch ( Exception ex )
1176
- {
1177
- this . DebugMsg ( string . Format ( "DetermineStartingDirectory: {0}" , ex . Message ) ) ;
1178
- }
1179
-
1180
-
1181
- if ( StartUri != null )
1182
- {
1183
-
// Removed code: replaces everything after the last "/" in the path with nothing,
// keeping that "/", so the path ends at its containing directory.
1184
- Path = Regex . Replace ( Path , "/[^/]*$" , "/" , RegexOptions . IgnoreCase ) ;
1185
-
1186
- if ( Path . Length == 0 )
1187
- {
1188
- Path = "/" ;
1189
- }
1190
-
// Removed code: the parent and child directories were built from the identical
// scheme + "://" + host + optional ":port" + path string.
1191
- this . SetParentStartingDirectory (
1192
- Url : string . Join (
1193
- "" ,
1194
- StartUri . Scheme ,
1195
- "://" ,
1196
- StartUri . Host ,
1197
- StartUriPort ,
1198
- Path
1199
- )
1200
- ) ;
1201
-
1202
- this . SetChildStartingDirectory (
1203
- Url : string . Join (
1204
- "" ,
1205
- StartUri . Scheme ,
1206
- "://" ,
1207
- StartUri . Host ,
1208
- StartUriPort ,
1209
- Path
1210
- )
1211
- ) ;
1212
-
1213
- }
1214
-
// Added code: the same value is still used for both directories, now taken
// from the shared helper.
1152
+ string StartingUrl = MacroscopeHttpUrlUtils . DetermineStartingDirectory ( Url : this . GetStartUrl ( ) ) ;
1153
+ this . SetParentStartingDirectory ( Url : StartingUrl ) ;
1154
+ this . SetChildStartingDirectory ( Url : StartingUrl ) ;
1215
1155
}
1216
1156
1217
1157
/** -------------------------------------------------------------------- **/
0 commit comments