-
Notifications
You must be signed in to change notification settings - Fork 52
Open
Labels
Description
Original issue report from a user:
https://console.apify.com/organization/ZscMwFR5H7eCtWtyh/actors/moJRLRc85AitArpNN/issues/OTIaynfUmAfVbu5Ac
On WCC, the same websites are accessed and scraped just fine, so the issue may be connected with some anti-blocking mechanisms (even though we've updated Web Scraper to the latest Crawlee and Impit).
Example runs that are failing:
Run 1:
https://console.apify.com/admin/users/Nm4QsZo2PqzLLG3Tf/actors/runs/OByAd7YvvKSe2DUBr#log
Start URL: https://www.defence.gov.au/news-events/releases
Input:
{
"runMode": "DEVELOPMENT",
"startUrls": [
{
"url": "https://www.defence.gov.au/news-events/releases",
"method": "GET"
}
],
"keepUrlFragments": false,
"linkSelector": "a[href]",
"pseudoUrls": [
{
"purl": "https://apify.com[(/[\\w-]+)?]"
}
],
"pageFunction": "// The function accepts a single argument: the \"context\" object.\n// For a complete list of its properties and functions,\n// see https://apify.com/apify/web-scraper#page-function \nasync function pageFunction(context) {\n // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!\n // debugger; \n\n // jQuery is handy for finding DOM elements and extracting data from them.\n // To use it, make sure to enable the \"Inject jQuery\" option.\n const $ = context.jQuery;\n await context.waitFor(5000);\n const pageTitle = $('title').first().text();\n const pageContent = $('html').html();\n\n // Print some information to actor log\n context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);\n\n // Manually add a new page to the queue for scraping.\n // To make this work, make sure the \"Use request queue\" option is enabled.\n //context.enqueueRequest({ url: 'http://www.example.com' });\n\n // Return an object with the data extracted from the page.\n // It will be stored to the resulting dataset.\n return {\n url: context.request.url,\n pageContent\n };\n}",
"injectJQuery": true,
"injectUnderscore": false,
"proxyConfiguration": {
"useApifyProxy": true,
"apifyProxyGroups": [
"RESIDENTIAL"
]
},
"proxyRotation": "RECOMMENDED",
"useChrome": false,
"useStealth": false,
"ignoreSslErrors": true,
"ignoreCorsAndCsp": false,
"downloadMedia": false,
"downloadCss": false,
"waitUntil": [
"networkidle2"
],
"breakpointLocation": "NONE",
"debugLog": false,
"browserLog": false,
"initialCookies": [
{
"name": "nmstat",
"value": "d1c0d046-f75b-5c46-bda6-de95cd4cdab0",
"domain": "www.defence.gov.au"
},
{
"name": "_ga_7TYP3R2XDD",
"value": "GS2.1.s1768937441$o1$g0$t1768937441$j60$l0$h0",
"domain": "www.defence.gov.au"
},
{
"name": "bm_sv",
"value": "CF1B85A3280342917F30733F44221F70~YAAQh1jWF4+oItWbAQAArEvj3B6LM78Qd5dsctXDxvHHz4kCJejJy58Yb2AYrG2FI0H9gaDOnBq3iPBsgphw9Ag2AEhslHsblRCd+GyULiTUsmyaDBEsqFwS3c/IiE4Bawxdxp7ymob0aHKNkbTfcWGtfjG/fV50kl1fdzQUGzCHdqrnHKZZHw1E9AslEeZIKtJ0imjshPgLm3kuXl0zY1EDwhAKT2Ed9rJYbB6KK+Yi/y6xOWulbhSbhhHg26VLLpn9cw",
"domain": "www.defence.gov.au"
},
{
"name": "_ga_0XT7NFV9ZS",
"value": "GS2.1.s1768937442$o1$g0$t1768937442$j60$l0$h0",
"domain": "www.defence.gov.au"
},
{
"name": "_ga",
"value": "GA1.3.1461178366.1768937442",
"domain": "www.defence.gov.au"
},
{
"name": "_gid",
"value": "GA1.3.975453558.1768937442",
"domain": "www.defence.gov.au"
},
{
"name": "_gat_gtag_UA_2848578_38",
"value": "1",
"domain": "www.defence.gov.au"
},
{
"name": "ak_bmsc",
"value": "CD2FF8AE78B0AE4955D6437CF8C423DA~000000000000000000000000000000~YAAQh1jWF5OoItWbAQAAYEzj3B5dh+Xk++24qGq4KhXWQg24Dt7Pz7wayx3XtGIei8hQdL8CTdScSUdUa52RPrt2dwG6rMgIAsZeY2F9x6SEMApjl89dKl2AcOmsi33+/YemJDbFw3ij1TGgNUe2x3OhYcDxO4Y7z77caEUbOczs6Lkxnd0lK6Z3isM8qgl3hQamWdyYVaaNTsEd5fU/sFOpOhqFw2uVUAvavj1Uwtt1UyW1mDP63FzgcFQt+HTXY6EeSJaxXawfruAiivcbt2Y9n13KLu3CdlY7A3CcWBfH9pmolkwoR0UXweEzDc+G25kYrjFJn9Nh0IpSe6Rlfq9Rof5oLGy9/Qi7nTPcctaVRWCYJrPdq/+9Xtkop+hkFdK2GEx1mVytg4j10OWjGHestk+vOB3wb8E/r2V8Tt1Cw1DfXTXqwrKTaoeO38Z43t8F3sSLRo5u5WyOG12jAezYl3oF",
"domain": "www.defence.gov.au"
}
],
"respectRobotsTxtFile": false,
"globs": [],
"excludes": [],
"headless": true,
"maxRequestRetries": 3,
"maxPagesPerCrawl": 0,
"maxResultsPerCrawl": 0,
"maxCrawlingDepth": 0,
"maxConcurrency": 50,
"pageLoadTimeoutSecs": 60,
"pageFunctionTimeoutSecs": 60,
"closeCookieModals": false,
"maxScrollHeightPixels": 5000,
"customData": {}
}
Run 2:
https://console.apify.com/admin/users/Nm4QsZo2PqzLLG3Tf/actors/runs/7GhWxBheU9ZRIaiYz#log
Start URL: https://www.royalnavy.mod.uk/news
Input:
{
"runMode": "DEVELOPMENT",
"startUrls": [
{
"url": "https://www.royalnavy.mod.uk/news",
"method": "GET"
}
],
"keepUrlFragments": false,
"linkSelector": "a[href]",
"pseudoUrls": [
{
"purl": "https://apify.com[(/[\\w-]+)?]"
}
],
"pageFunction": "// The function accepts a single argument: the \"context\" object.\n// For a complete list of its properties and functions,\n// see https://apify.com/apify/web-scraper#page-function \nasync function pageFunction(context) {\n // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!\n // debugger; \n\n // jQuery is handy for finding DOM elements and extracting data from them.\n // To use it, make sure to enable the \"Inject jQuery\" option.\n const $ = context.jQuery;\n await context.waitFor(5000);\n const pageTitle = $('title').first().text();\n const pageContent = $('html').html();\n\n // Print some information to actor log\n context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);\n\n // Manually add a new page to the queue for scraping.\n // To make this work, make sure the \"Use request queue\" option is enabled.\n //context.enqueueRequest({ url: 'http://www.example.com' });\n\n // Return an object with the data extracted from the page.\n // It will be stored to the resulting dataset.\n return {\n url: context.request.url,\n pageContent\n };\n}",
"injectJQuery": true,
"injectUnderscore": false,
"proxyConfiguration": {
"useApifyProxy": true,
"apifyProxyGroups": [
"RESIDENTIAL"
]
},
"proxyRotation": "RECOMMENDED",
"useChrome": false,
"useStealth": false,
"ignoreSslErrors": true,
"ignoreCorsAndCsp": false,
"downloadMedia": false,
"downloadCss": false,
"waitUntil": [
"networkidle2"
],
"breakpointLocation": "NONE",
"debugLog": false,
"browserLog": false,
"initialCookies": [
{
"name": "__cf_bm",
"value": "7zCjosvtpPVhj0JowmRcclrB5JGy1Hq4HLdsOJDNUTk-1771489757-1.0.1.1-rsDlnOg5fmN_fGZcUmYn1M.L1ujrvZnXk8.kcB7EpwM2vKHGOjQI8QZ9oysGCcfLohMghlW0aNI8uWzwxZZtVs0y1fhLhSmr1Gd3x2JJ5bQ",
"domain": "www.royalnavy.mod.uk"
},
{
"name": "CookieControl",
"value": "{\\\"necessaryCookies\\\":[],\\\"optionalCookies\\\":{},\\\"statement\\\":{},\\\"consentDate\\\":1771489761002,\\\"consentExpiry\\\":365,\\\"interactedWith\\\":true,\\\"user\\\":\\\"B5ACB496-7D39-4427-8C25-8F24754FFB57\\\"}",
"domain": "www.royalnavy.mod.uk"
}
],
"respectRobotsTxtFile": false,
"globs": [],
"excludes": [],
"headless": true,
"maxRequestRetries": 3,
"maxPagesPerCrawl": 0,
"maxResultsPerCrawl": 0,
"maxCrawlingDepth": 0,
"maxConcurrency": 50,
"pageLoadTimeoutSecs": 60,
"pageFunctionTimeoutSecs": 60,
"closeCookieModals": false,
"maxScrollHeightPixels": 5000,
"customData": {}
}
Reactions are currently unavailable