Skip to content

Failed runs on Web Scraper #248

@nicklamonov

Description

@nicklamonov

Original issue report from a user:
https://console.apify.com/organization/ZscMwFR5H7eCtWtyh/actors/moJRLRc85AitArpNN/issues/OTIaynfUmAfVbu5Ac

On WCC (Website Content Crawler), the same websites are accessed and scraped just fine, so the issue may be connected with some anti-blocking mechanisms (even though we've updated Web Scraper to the latest Crawlee and Impit).

Examples of failing runs:

Run 1:
https://console.apify.com/admin/users/Nm4QsZo2PqzLLG3Tf/actors/runs/OByAd7YvvKSe2DUBr#log
Start URL: https://www.defence.gov.au/news-events/releases
Input:

{
  "runMode": "DEVELOPMENT",
  "startUrls": [
    {
      "url": "https://www.defence.gov.au/news-events/releases",
      "method": "GET"
    }
  ],
  "keepUrlFragments": false,
  "linkSelector": "a[href]",
  "pseudoUrls": [
    {
      "purl": "https://apify.com[(/[\\w-]+)?]"
    }
  ],
  "pageFunction": "// The function accepts a single argument: the \"context\" object.\n// For a complete list of its properties and functions,\n// see https://apify.com/apify/web-scraper#page-function \nasync function pageFunction(context) {\n    // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!\n    // debugger; \n\n    // jQuery is handy for finding DOM elements and extracting data from them.\n    // To use it, make sure to enable the \"Inject jQuery\" option.\n    const $ = context.jQuery;\n    await context.waitFor(5000);\n    const pageTitle = $('title').first().text();\n    const pageContent = $('html').html();\n\n    // Print some information to actor log\n    context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);\n\n    // Manually add a new page to the queue for scraping.\n    // To make this work, make sure the \"Use request queue\" option is enabled.\n    //context.enqueueRequest({ url: 'http://www.example.com' });\n\n    // Return an object with the data extracted from the page.\n    // It will be stored to the resulting dataset.\n    return {\n         url: context.request.url,\n       pageContent\n    };\n}",
  "injectJQuery": true,
  "injectUnderscore": false,
  "proxyConfiguration": {
    "useApifyProxy": true,
    "apifyProxyGroups": [
      "RESIDENTIAL"
    ]
  },
  "proxyRotation": "RECOMMENDED",
  "useChrome": false,
  "useStealth": false,
  "ignoreSslErrors": true,
  "ignoreCorsAndCsp": false,
  "downloadMedia": false,
  "downloadCss": false,
  "waitUntil": [
    "networkidle2"
  ],
  "breakpointLocation": "NONE",
  "debugLog": false,
  "browserLog": false,
  "initialCookies": [
    {
      "name": "nmstat",
      "value": "d1c0d046-f75b-5c46-bda6-de95cd4cdab0",
      "domain": "www.defence.gov.au"
    },
    {
      "name": "_ga_7TYP3R2XDD",
      "value": "GS2.1.s1768937441$o1$g0$t1768937441$j60$l0$h0",
      "domain": "www.defence.gov.au"
    },
    {
      "name": "bm_sv",
      "value": "CF1B85A3280342917F30733F44221F70~YAAQh1jWF4+oItWbAQAArEvj3B6LM78Qd5dsctXDxvHHz4kCJejJy58Yb2AYrG2FI0H9gaDOnBq3iPBsgphw9Ag2AEhslHsblRCd+GyULiTUsmyaDBEsqFwS3c/IiE4Bawxdxp7ymob0aHKNkbTfcWGtfjG/fV50kl1fdzQUGzCHdqrnHKZZHw1E9AslEeZIKtJ0imjshPgLm3kuXl0zY1EDwhAKT2Ed9rJYbB6KK+Yi/y6xOWulbhSbhhHg26VLLpn9cw",
      "domain": "www.defence.gov.au"
    },
    {
      "name": "_ga_0XT7NFV9ZS",
      "value": "GS2.1.s1768937442$o1$g0$t1768937442$j60$l0$h0",
      "domain": "www.defence.gov.au"
    },
    {
      "name": "_ga",
      "value": "GA1.3.1461178366.1768937442",
      "domain": "www.defence.gov.au"
    },
    {
      "name": "_gid",
      "value": "GA1.3.975453558.1768937442",
      "domain": "www.defence.gov.au"
    },
    {
      "name": "_gat_gtag_UA_2848578_38",
      "value": "1",
      "domain": "www.defence.gov.au"
    },
    {
      "name": "ak_bmsc",
      "value": "CD2FF8AE78B0AE4955D6437CF8C423DA~000000000000000000000000000000~YAAQh1jWF5OoItWbAQAAYEzj3B5dh+Xk++24qGq4KhXWQg24Dt7Pz7wayx3XtGIei8hQdL8CTdScSUdUa52RPrt2dwG6rMgIAsZeY2F9x6SEMApjl89dKl2AcOmsi33+/YemJDbFw3ij1TGgNUe2x3OhYcDxO4Y7z77caEUbOczs6Lkxnd0lK6Z3isM8qgl3hQamWdyYVaaNTsEd5fU/sFOpOhqFw2uVUAvavj1Uwtt1UyW1mDP63FzgcFQt+HTXY6EeSJaxXawfruAiivcbt2Y9n13KLu3CdlY7A3CcWBfH9pmolkwoR0UXweEzDc+G25kYrjFJn9Nh0IpSe6Rlfq9Rof5oLGy9/Qi7nTPcctaVRWCYJrPdq/+9Xtkop+hkFdK2GEx1mVytg4j10OWjGHestk+vOB3wb8E/r2V8Tt1Cw1DfXTXqwrKTaoeO38Z43t8F3sSLRo5u5WyOG12jAezYl3oF",
      "domain": "www.defence.gov.au"
    }
  ],
  "respectRobotsTxtFile": false,
  "globs": [],
  "excludes": [],
  "headless": true,
  "maxRequestRetries": 3,
  "maxPagesPerCrawl": 0,
  "maxResultsPerCrawl": 0,
  "maxCrawlingDepth": 0,
  "maxConcurrency": 50,
  "pageLoadTimeoutSecs": 60,
  "pageFunctionTimeoutSecs": 60,
  "closeCookieModals": false,
  "maxScrollHeightPixels": 5000,
  "customData": {}
}

Run 2:
https://console.apify.com/admin/users/Nm4QsZo2PqzLLG3Tf/actors/runs/7GhWxBheU9ZRIaiYz#log
Start URL: https://www.royalnavy.mod.uk/news

Input:

{
  "runMode": "DEVELOPMENT",
  "startUrls": [
    {
      "url": "https://www.royalnavy.mod.uk/news",
      "method": "GET"
    }
  ],
  "keepUrlFragments": false,
  "linkSelector": "a[href]",
  "pseudoUrls": [
    {
      "purl": "https://apify.com[(/[\\w-]+)?]"
    }
  ],
  "pageFunction": "// The function accepts a single argument: the \"context\" object.\n// For a complete list of its properties and functions,\n// see https://apify.com/apify/web-scraper#page-function \nasync function pageFunction(context) {\n    // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!\n    // debugger; \n\n    // jQuery is handy for finding DOM elements and extracting data from them.\n    // To use it, make sure to enable the \"Inject jQuery\" option.\n    const $ = context.jQuery;\n    await context.waitFor(5000);\n    const pageTitle = $('title').first().text();\n    const pageContent = $('html').html();\n\n    // Print some information to actor log\n    context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);\n\n    // Manually add a new page to the queue for scraping.\n    // To make this work, make sure the \"Use request queue\" option is enabled.\n    //context.enqueueRequest({ url: 'http://www.example.com' });\n\n    // Return an object with the data extracted from the page.\n    // It will be stored to the resulting dataset.\n    return {\n         url: context.request.url,\n       pageContent\n    };\n}",
  "injectJQuery": true,
  "injectUnderscore": false,
  "proxyConfiguration": {
    "useApifyProxy": true,
    "apifyProxyGroups": [
      "RESIDENTIAL"
    ]
  },
  "proxyRotation": "RECOMMENDED",
  "useChrome": false,
  "useStealth": false,
  "ignoreSslErrors": true,
  "ignoreCorsAndCsp": false,
  "downloadMedia": false,
  "downloadCss": false,
  "waitUntil": [
    "networkidle2"
  ],
  "breakpointLocation": "NONE",
  "debugLog": false,
  "browserLog": false,
  "initialCookies": [
    {
      "name": "__cf_bm",
      "value": "7zCjosvtpPVhj0JowmRcclrB5JGy1Hq4HLdsOJDNUTk-1771489757-1.0.1.1-rsDlnOg5fmN_fGZcUmYn1M.L1ujrvZnXk8.kcB7EpwM2vKHGOjQI8QZ9oysGCcfLohMghlW0aNI8uWzwxZZtVs0y1fhLhSmr1Gd3x2JJ5bQ",
      "domain": "www.royalnavy.mod.uk"
    },
    {
      "name": "CookieControl",
      "value": "{\\\"necessaryCookies\\\":[],\\\"optionalCookies\\\":{},\\\"statement\\\":{},\\\"consentDate\\\":1771489761002,\\\"consentExpiry\\\":365,\\\"interactedWith\\\":true,\\\"user\\\":\\\"B5ACB496-7D39-4427-8C25-8F24754FFB57\\\"}",
      "domain": "www.royalnavy.mod.uk"
    }
  ],
  "respectRobotsTxtFile": false,
  "globs": [],
  "excludes": [],
  "headless": true,
  "maxRequestRetries": 3,
  "maxPagesPerCrawl": 0,
  "maxResultsPerCrawl": 0,
  "maxCrawlingDepth": 0,
  "maxConcurrency": 50,
  "pageLoadTimeoutSecs": 60,
  "pageFunctionTimeoutSecs": 60,
  "closeCookieModals": false,
  "maxScrollHeightPixels": 5000,
  "customData": {}
}

Metadata

Metadata

Assignees

Labels

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions