Skip to content

Commit 30fdb4c

Browse files
committed
Fixed parent/child directory parsing and unit tests
1 parent 4ea8ebb commit 30fdb4c

File tree

3 files changed

+75
-115
lines changed

3 files changed

+75
-115
lines changed

SEOMacroscopeSeriesOne/src/MacroscopeStandards/HTTP/MacroscopeHttpUrlUtils.cs

+57-35
Original file line numberDiff line numberDiff line change
@@ -628,13 +628,63 @@ public static int FindUrlDepth ( string Url )
628628

629629
}
630630

631+
/**************************************************************************/
632+
633+
/// <summary>
/// Reduces a URL to the directory that contains it: scheme, host, port, and
/// the path with everything after its final "/" removed.
/// </summary>
/// <param name="Url">URL string to parse with <see cref="Uri"/>.</param>
/// <returns>
/// The directory URL, or null when no <see cref="Uri"/> could be constructed
/// from <paramref name="Url"/>.
/// </returns>
public static string DetermineStartingDirectory ( string Url )
{

  Uri ParsedUri = null;
  string DirectoryPath = "/";
  string PortSuffix = "";

  try
  {

    ParsedUri = new Uri( Url );

    if( ParsedUri.Port > 0 )
    {
      PortSuffix = string.Format( ":{0}", ParsedUri.Port );
    }

    DirectoryPath = ParsedUri.AbsolutePath;

  }
  catch( Exception ex )
  {
    // A single handler suffices: UriFormatException and every other
    // exception type are reported in exactly the same way.
    DebugMsg( string.Format( "DetermineStartingDirectory: {0}", ex.Message ), true );
  }

  // No Uri object means there is nothing to build a directory URL from.
  if( ParsedUri == null )
  {
    return ( null );
  }

  // Drop the final path segment, keeping the trailing slash of its directory.
  DirectoryPath = Regex.Replace( DirectoryPath, "/[^/]*$", "/", RegexOptions.IgnoreCase );
  DirectoryPath = ( DirectoryPath.Length == 0 ) ? "/" : DirectoryPath;

  return ( string.Concat( ParsedUri.Scheme, "://", ParsedUri.Host, PortSuffix, DirectoryPath ) );

}
638688

639689
/**************************************************************************/
640690

@@ -673,6 +723,7 @@ public static bool IsWithinParentDirectory ( string StartUrl, string Url )
673723
|| ( CurrentUri.Scheme.ToLower() == "https" ) )
674724
{
675725

726+
string StartingUrl = MacroscopeHttpUrlUtils.DetermineStartingDirectory( Url: StartUrl );
676727
string Path = CurrentUri.AbsolutePath;
677728
string CurrentUriString;
678729
int ParentStartingDirectoryLength;
@@ -694,12 +745,12 @@ public static bool IsWithinParentDirectory ( string StartUrl, string Url )
694745
Path
695746
);
696747

697-
ParentStartingDirectoryLength = StartUrl.Length;
748+
ParentStartingDirectoryLength = StartingUrl.Length;
698749
CurrentUriStringLength = CurrentUriString.Length;
699750

700751
if( ParentStartingDirectoryLength >= CurrentUriStringLength )
701752
{
702-
if( StartUrl.StartsWith( CurrentUriString, StringComparison.Ordinal ) )
753+
if( StartingUrl.StartsWith( CurrentUriString, StringComparison.Ordinal ) )
703754
{
704755
IsWithin = true;
705756
}
@@ -751,6 +802,7 @@ public static bool IsWithinChildDirectory ( string StartUrl, string Url )
751802
|| ( CurrentUri.Scheme.ToLower() == "https" ) )
752803
{
753804

805+
string StartingUrl = MacroscopeHttpUrlUtils.DetermineStartingDirectory( Url: StartUrl );
754806
string Path = CurrentUri.AbsolutePath;
755807
string CurrentUriString;
756808
int ChildStartingDirectoryLength;
@@ -772,13 +824,13 @@ public static bool IsWithinChildDirectory ( string StartUrl, string Url )
772824
Path
773825
);
774826

775-
ChildStartingDirectoryLength = StartUrl.Length;
827+
ChildStartingDirectoryLength = StartingUrl.Length;
776828
CurrentUriStringLength = CurrentUriString.Length;
777829

778830
if( CurrentUriStringLength >= ChildStartingDirectoryLength )
779831
{
780832

781-
if( CurrentUriString.StartsWith( StartUrl, StringComparison.Ordinal ) )
833+
if( CurrentUriString.StartsWith( StartingUrl, StringComparison.Ordinal ) )
782834
{
783835
IsWithin = true;
784836
}
@@ -792,37 +844,7 @@ public static bool IsWithinChildDirectory ( string StartUrl, string Url )
792844
return ( IsWithin );
793845

794846
}
795-
796-
797-
798-
799-
800-
801-
802-
803-
804-
805-
806-
807-
808-
809-
810-
811-
812-
813-
814-
815-
816-
817-
818-
819-
820-
821-
822-
823-
824-
825-
847+
826848
/**************************************************************************/
827849

828850
public static string CleanUrlCss ( string CssProperty )

SEOMacroscopeSeriesOne/src/MacroscopeTasks/MacroscopeJobMaster.cs

+18-78
Original file line numberDiff line numberDiff line change
@@ -372,27 +372,25 @@ public MacroscopeDataExtractorXpaths GetDataExtractorXpaths ()
372372
public bool Execute ()
373373
{
374374

375-
this.DebugMsg( string.Format( "Start URL: {0}", this.StartUrl ) );
375+
this.DebugMsg( string.Format( "Start URL: {0}", this.GetStartUrl() ) );
376376

377377
//this.LogEntry( string.Format( "Executing with Start URL: {0}", this.StartUrl ) );
378378

379-
this.StartUrl = MacroscopeHttpUrlUtils.SanitizeUrl( Url: this.StartUrl );
379+
this.SetStartUrl( Url: MacroscopeHttpUrlUtils.SanitizeUrl( Url: this.GetStartUrl() ) );
380380

381-
this.DocCollection.SetStartUrl( Url: this.StartUrl );
382-
383-
this.DetermineStartingDirectory();
381+
this.DocCollection.SetStartUrl( Url: this.GetStartUrl() );
384382

385383
this.SetThreadsStop( Stopped: false );
386384

387-
this.AllowedHosts.AddFromUrl( Url: this.StartUrl );
385+
this.AllowedHosts.AddFromUrl( Url: this.GetStartUrl() );
388386

389387
if( !this.PeekUrlQueue() )
390388
{
391389

392390
{ // Add robots.txt URL to queue
393391
if( MacroscopePreferencesManager.GetFollowRobotsProtocol() )
394392
{
395-
string RobotsUrl = MacroscopeRobots.GenerateRobotUrl( Url: this.StartUrl );
393+
string RobotsUrl = MacroscopeRobots.GenerateRobotUrl( Url: this.GetStartUrl() );
396394
if( !string.IsNullOrEmpty( RobotsUrl ) )
397395
{
398396
this.AddUrlQueueItem( Url: RobotsUrl );
@@ -406,7 +404,7 @@ public bool Execute ()
406404
MacroscopeSitemapPaths SitemapPaths = new MacroscopeSitemapPaths();
407405
foreach( string SitemapPath in SitemapPaths.IterateSitemapPaths() )
408406
{
409-
string SitemapUrl = MacroscopeSitemapPaths.GenerateSitemapUrl( Url: this.StartUrl, SitemapPath: SitemapPath );
407+
string SitemapUrl = MacroscopeSitemapPaths.GenerateSitemapUrl( Url: this.GetStartUrl(), SitemapPath: SitemapPath );
410408
if( !string.IsNullOrEmpty( SitemapUrl ) )
411409
{
412410
this.AddUrlQueueItem( Url: SitemapUrl );
@@ -418,17 +416,17 @@ public bool Execute ()
418416
{ // Add humans.txt URL to queue
419417
if( MacroscopePreferencesManager.GetProbeHumansText() )
420418
{
421-
string HumansUrl = MacroscopeHumans.GenerateHumansUrl( Url: this.StartUrl );
419+
string HumansUrl = MacroscopeHumans.GenerateHumansUrl( Url: this.GetStartUrl() );
422420
if( !string.IsNullOrEmpty( HumansUrl ) )
423421
{
424422
this.AddUrlQueueItem( Url: HumansUrl );
425423
}
426424
}
427425
}
428426

429-
this.IncludeExcludeUrls.AddExplicitIncludeUrl( Url: this.StartUrl );
427+
this.IncludeExcludeUrls.AddExplicitIncludeUrl( Url: this.GetStartUrl() );
430428

431-
this.AddUrlQueueItem( Url: this.StartUrl );
429+
this.AddUrlQueueItem( Url: this.GetStartUrl() );
432430

433431
foreach( MacroscopeDocument msDoc in this.GetDocCollection().IterateDocuments() )
434432
{
@@ -437,9 +435,9 @@ public bool Execute ()
437435

438436
}
439437

440-
this.ProbeRobotsFile( Url: this.StartUrl );
438+
this.ProbeRobotsFile( Url: this.GetStartUrl() );
441439

442-
this.SetCrawlDelay( Url: this.StartUrl );
440+
this.SetCrawlDelay( Url: this.GetStartUrl() );
443441

444442
this.SpawnWorkers();
445443

@@ -450,7 +448,7 @@ public bool Execute ()
450448
this.TaskController.ICallbackScanComplete();
451449
}
452450

453-
this.AddUpdateDisplayQueue( Url: this.StartUrl );
451+
this.AddUpdateDisplayQueue( Url: this.GetStartUrl() );
454452

455453
return ( true );
456454

@@ -1089,6 +1087,7 @@ private void ResetLink ( MacroscopeDocument msDoc )
10891087
/// <summary>
/// Stores the start URL and then recomputes the starting directory from it.
/// </summary>
/// <param name="Url">The new start URL.</param>
/// <returns>The value of the StartUrl field after the assignment.</returns>
public string SetStartUrl ( string Url )
{

  this.StartUrl = Url;

  // Keep this after the assignment: the call re-reads the start URL
  // through GetStartUrl() when it derives the starting directory.
  this.DetermineStartingDirectory();

  return this.StartUrl;

}
10941093

@@ -1103,7 +1102,7 @@ public string GetStartUrl ()
11031102

11041103
public string GetStartUriHostAndPort ()
11051104
{
1106-
Uri StartUri = new Uri( this.StartUrl );
1105+
Uri StartUri = new Uri( this.GetStartUrl() );
11071106
string StartUriHostAndPort = null;
11081107
if( StartUri != null )
11091108
{
@@ -1148,70 +1147,11 @@ private void IncPagesFound ()
11481147

11491148
/** Crawl Parent / Child Directories **************************************/
11501149

1151-
public void DetermineStartingDirectory ()
1150+
private void DetermineStartingDirectory ()
11521151
{
1153-
1154-
Uri StartUri = null;
1155-
string Path = "/";
1156-
string StartUriPort = "";
1157-
1158-
try
1159-
{
1160-
1161-
StartUri = new Uri( this.GetStartUrl() );
1162-
1163-
if( StartUri.Port > 0 )
1164-
{
1165-
StartUriPort = string.Format( ":{0}", StartUri.Port );
1166-
}
1167-
1168-
Path = StartUri.AbsolutePath;
1169-
1170-
}
1171-
catch( UriFormatException ex )
1172-
{
1173-
this.DebugMsg( string.Format( "DetermineStartingDirectory: {0}", ex.Message ) );
1174-
}
1175-
catch( Exception ex )
1176-
{
1177-
this.DebugMsg( string.Format( "DetermineStartingDirectory: {0}", ex.Message ) );
1178-
}
1179-
1180-
1181-
if( StartUri != null )
1182-
{
1183-
1184-
Path = Regex.Replace( Path, "/[^/]*$", "/", RegexOptions.IgnoreCase );
1185-
1186-
if( Path.Length == 0 )
1187-
{
1188-
Path = "/";
1189-
}
1190-
1191-
this.SetParentStartingDirectory(
1192-
Url: string.Join(
1193-
"",
1194-
StartUri.Scheme,
1195-
"://",
1196-
StartUri.Host,
1197-
StartUriPort,
1198-
Path
1199-
)
1200-
);
1201-
1202-
this.SetChildStartingDirectory(
1203-
Url: string.Join(
1204-
"",
1205-
StartUri.Scheme,
1206-
"://",
1207-
StartUri.Host,
1208-
StartUriPort,
1209-
Path
1210-
)
1211-
);
1212-
1213-
}
1214-
1152+
string StartingUrl = MacroscopeHttpUrlUtils.DetermineStartingDirectory( Url: this.GetStartUrl() );
1153+
this.SetParentStartingDirectory( Url: StartingUrl );
1154+
this.SetChildStartingDirectory( Url: StartingUrl );
12151155
}
12161156

12171157
/** -------------------------------------------------------------------- **/

SEOMacroscopeSeriesOne/src/MacroscopeTasks/t/TestMacroscopeJobMaster.cs

-2
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,6 @@ public void TestJobMasterStartUrl ()
5151

5252
JobMaster.SetStartUrl( Url: StartUrl );
5353

54-
JobMaster.DetermineStartingDirectory();
55-
5654
Assert.AreEqual( StartUrl, JobMaster.GetStartUrl(), string.Format( "FAIL: {0}", StartUrl ) );
5755

5856
}

0 commit comments

Comments
 (0)