@@ -2461,8 +2461,9 @@ impl Website {
24612461 self . status = CrawlStatus :: Blocked ;
24622462 } else if page. status_code == reqwest:: StatusCode :: TOO_MANY_REQUESTS {
24632463 self . status = CrawlStatus :: RateLimited ;
2464- } else if page. status_code == * UNKNOWN_STATUS_ERROR
2465- || page. status_code == * CHROME_UNKNOWN_STATUS_ERROR
2464+ } else if ( page. status_code == * UNKNOWN_STATUS_ERROR
2465+ || page. status_code == * CHROME_UNKNOWN_STATUS_ERROR )
2466+ && ( page. is_empty ( ) || page. error_status . is_some ( ) )
24662467 {
24672468 self . status = CrawlStatus :: ConnectError ;
24682469 } else if page. status_code . is_server_error ( ) {
@@ -10990,6 +10991,96 @@ mod tests {
1099010991 "Should not build rotator with no proxies"
1099110992 ) ;
1099210993 }
10994+
10995+ fn make_page ( status : reqwest:: StatusCode ) -> crate :: page:: Page {
10996+ let mut page = crate :: page:: Page :: default ( ) ;
10997+ page. status_code = status;
10998+ page
10999+ }
11000+
/// A default page tagged with `UNKNOWN_STATUS_ERROR` (no body or error assigned)
/// must be classified as `CrawlStatus::ConnectError`.
#[test]
fn test_crawl_status_599_empty_page_is_connect_error() {
    let expected = super::CrawlStatus::ConnectError;
    let failed = make_page(*crate::page::UNKNOWN_STATUS_ERROR);
    let no_links = hashbrown::HashSet::new();
    let mut site = crate::website::Website::new("http://example.com");

    site.set_crawl_initial_status(&failed, &no_links);

    assert_eq!(*site.get_status(), expected);
}
11009+
/// A default page tagged with `CHROME_UNKNOWN_STATUS_ERROR` (no body or error
/// assigned) must be classified as `CrawlStatus::ConnectError`.
#[test]
fn test_crawl_status_598_empty_page_is_connect_error() {
    let expected = super::CrawlStatus::ConnectError;
    let failed = make_page(*crate::page::CHROME_UNKNOWN_STATUS_ERROR);
    let no_links = hashbrown::HashSet::new();
    let mut site = crate::website::Website::new("http://example.com");

    site.set_crawl_initial_status(&failed, &no_links);

    assert_eq!(*site.get_status(), expected);
}
11018+
/// Even when HTML is present, a `CHROME_UNKNOWN_STATUS_ERROR` page that also
/// carries an `error_status` must be classified as `CrawlStatus::ConnectError`.
#[test]
fn test_crawl_status_598_with_error_status_is_connect_error() {
    let expected = super::CrawlStatus::ConnectError;

    // Page with both a body and an explicit error message attached.
    let mut errored = make_page(*crate::page::CHROME_UNKNOWN_STATUS_ERROR);
    let body = b"<html><body>some content</body></html>".to_vec();
    errored.html = Some(body.into());
    errored.error_status = Some("Invalid proxy configuration.".into());

    let no_links = hashbrown::HashSet::new();
    let mut site = crate::website::Website::new("http://example.com");
    site.set_crawl_initial_status(&errored, &no_links);

    assert_eq!(*site.get_status(), expected);
}
11029+
/// A `CHROME_UNKNOWN_STATUS_ERROR` page that has HTML and no `error_status`
/// is expected to be reported as `CrawlStatus::ServerError`, not a connect error.
#[test]
fn test_crawl_status_598_with_content_no_error_is_server_error() {
    let expected = super::CrawlStatus::ServerError;

    // Body is set, `error_status` is left at whatever `make_page` produced.
    let mut with_body = make_page(*crate::page::CHROME_UNKNOWN_STATUS_ERROR);
    let body = b"<html><body>real server content</body></html>".to_vec();
    with_body.html = Some(body.into());

    let no_links = hashbrown::HashSet::new();
    let mut site = crate::website::Website::new("http://example.com");
    site.set_crawl_initial_status(&with_body, &no_links);

    assert_eq!(*site.get_status(), expected);
}
11043+
/// An `UNKNOWN_STATUS_ERROR` page that has HTML and no `error_status`
/// is expected to be reported as `CrawlStatus::ServerError`, not a connect error.
#[test]
fn test_crawl_status_599_with_content_no_error_is_server_error() {
    let expected = super::CrawlStatus::ServerError;

    // Body is set, `error_status` is left at whatever `make_page` produced.
    let mut with_body = make_page(*crate::page::UNKNOWN_STATUS_ERROR);
    let body = b"<html><body>real server content</body></html>".to_vec();
    with_body.html = Some(body.into());

    let no_links = hashbrown::HashSet::new();
    let mut site = crate::website::Website::new("http://example.com");
    site.set_crawl_initial_status(&with_body, &no_links);

    assert_eq!(*site.get_status(), expected);
}
11057+
/// A page with `INTERNAL_SERVER_ERROR` must be classified as `CrawlStatus::ServerError`.
#[test]
fn test_crawl_status_500_is_server_error() {
    let expected = super::CrawlStatus::ServerError;
    let failing = make_page(reqwest::StatusCode::INTERNAL_SERVER_ERROR);
    let no_links = hashbrown::HashSet::new();
    let mut site = crate::website::Website::new("http://example.com");

    site.set_crawl_initial_status(&failing, &no_links);

    assert_eq!(*site.get_status(), expected);
}
11066+
/// A page with `TOO_MANY_REQUESTS` must be classified as `CrawlStatus::RateLimited`.
#[test]
fn test_crawl_status_429_is_rate_limited() {
    let expected = super::CrawlStatus::RateLimited;
    let throttled = make_page(reqwest::StatusCode::TOO_MANY_REQUESTS);
    let no_links = hashbrown::HashSet::new();
    let mut site = crate::website::Website::new("http://example.com");

    site.set_crawl_initial_status(&throttled, &no_links);

    assert_eq!(*site.get_status(), expected);
}
11075+
/// A default page with status `OK` and an empty link set must be classified
/// as `CrawlStatus::Empty`.
#[test]
fn test_crawl_status_empty_page_200_is_empty() {
    let expected = super::CrawlStatus::Empty;
    let ok_page = make_page(reqwest::StatusCode::OK);
    let no_links = hashbrown::HashSet::new();
    let mut site = crate::website::Website::new("http://example.com");

    site.set_crawl_initial_status(&ok_page, &no_links);

    assert_eq!(*site.get_status(), expected);
}
1099311084}
1099411085
1099511086#[ tokio:: test]
0 commit comments