From 5bad46285b3fa2a730c7790eacc0e6ddcb38d1a5 Mon Sep 17 00:00:00 2001 From: AkaNebur Date: Sun, 21 Jun 2026 11:49:32 +0200 Subject: [PATCH 1/3] feat(post): add search_posts content-search tool Adds a new MCP tool search_posts(keywords, date_posted=None, max_pages=3) that drives LinkedIn's global "Posts" content-search tab. It surfaces informal hiring posts ("we're hiring", "Buscamos ...", "join our team") that often appear before a formal job listing exists -- distinct from get_feed (the authenticated user's home feed) and get_company_posts (a single company page), so it gets its own tools/post.py module mirroring feed.py rather than folding into either. Follows the existing search-tool conventions: - New LinkedInExtractor.search_posts method plus a pure _build_content_search_url static helper that composes /search/results/content/?keywords=...&origin=FACETED_SEARCH and appends the datePosted facet as a URL-encoded one-element JSON list via the existing _encode_list_facet helper, mirroring how search_people encodes its network/currentCompany facets (content search uses literal datePosted tokens rather than job search's f_TPR=r codes). Underscore aliases normalise onto LinkedIn's tokens via _CONTENT_DATE_POSTED_MAP. - Content search is an infinite scroll with no &start= pagination, so max_pages maps to scroll depth (~5 scrolls/page via _CONTENT_SCROLLS_PER_PAGE). - Returns the canonical {url, sections, references?, section_errors?} shape: raw innerText under search_results plus feed_post permalink references. No structured per-post objects (no stable, locale- independent selector), matching the deliberate get_feed decision and the AGENTS.md scraping philosophy. - Invalid date_posted raises FilterValidationError (a ValueError subclass), re-raised in the tool layer as ToolError so the actionable message survives mask_error_details. Rate-limit responses surface as a typed section_errors entry, mirroring get_feed. 
- Thin tools/post.py:register_post_tools wired into server.py after register_feed_tools; tools/__init__.py docstring updated. - Two-layer tests (tests/test_scraping.py + tests/test_tools.py): URL building, alias normalisation, scroll-depth mapping, FilterValidationError -> ToolError surfacing, empty and rate-limited results, the Field(ge=1, le=10) boundary, and search_posts added to both timeout sweeps. --- linkedin_mcp_server/scraping/extractor.py | 110 +++++++++++++++++++++ linkedin_mcp_server/server.py | 2 + linkedin_mcp_server/tools/__init__.py | 1 + linkedin_mcp_server/tools/post.py | 114 ++++++++++++++++++++++ tests/test_scraping.py | 108 ++++++++++++++++++++ tests/test_tools.py | 73 ++++++++++++++ 6 files changed, 408 insertions(+) create mode 100644 linkedin_mcp_server/tools/post.py diff --git a/linkedin_mcp_server/scraping/extractor.py b/linkedin_mcp_server/scraping/extractor.py index 3b0b4549..231d4bc6 100644 --- a/linkedin_mcp_server/scraping/extractor.py +++ b/linkedin_mcp_server/scraping/extractor.py @@ -89,6 +89,25 @@ _SORT_BY_MAP = {"date": "DD", "relevance": "R"} +# Content (post) search uses literal ``datePosted`` tokens inside a JSON-list +# facet, e.g. ``datePosted=["past-week"]`` — unlike job search, which uses +# ``f_TPR=r86400``-style codes. Human-friendly underscore aliases map onto +# LinkedIn's exact tokens; the tokens themselves also pass through unchanged. +_CONTENT_DATE_POSTED_MAP = { + "past-24h": "past-24h", + "past_24_hours": "past-24h", + "past-24-hours": "past-24h", + "past-week": "past-week", + "past_week": "past-week", + "past-month": "past-month", + "past_month": "past-month", +} + +# Content search is an infinite scroll (no ``&start=`` pagination), so +# ``search_posts`` expresses depth as result "pages" of roughly this many +# scrolls each. +_CONTENT_SCROLLS_PER_PAGE = 5 + # Valid tokens for the people-search ``network`` facet. # LinkedIn accepts "F" (1st-degree), "S" (2nd-degree), "O" (3rd-degree and beyond). 
_NETWORK_TOKENS = ("F", "S", "O") @@ -3345,6 +3364,97 @@ async def search_companies( result["section_errors"] = section_errors return result + @staticmethod + def _build_content_search_url( + keywords: str, + date_posted: str | None = None, + ) -> str: + """Build a LinkedIn content (post) search URL. + + Reproduces the ``FACETED_SEARCH`` URL LinkedIn produces from the + Posts results tab, e.g. for "Buscamos Unity" in the past week: + ``/search/results/content/?keywords=Buscamos+Unity&origin=FACETED_SEARCH&datePosted=%5B%22past-week%22%5D`` + + The ``datePosted`` facet is a one-element JSON list carrying a literal + token (``past-24h`` / ``past-week`` / ``past-month``), URL-encoded — + unlike job search, which uses ``f_TPR=r``. Aliases are + normalized via ``_CONTENT_DATE_POSTED_MAP``; unknown values pass + through unchanged (callers validate first). + """ + params = f"keywords={quote_plus(keywords)}&origin=FACETED_SEARCH" + if date_posted: + token = _CONTENT_DATE_POSTED_MAP.get(date_posted.strip(), date_posted) + params += f"&datePosted={_encode_list_facet([token])}" + return f"https://www.linkedin.com/search/results/content/?{params}" + + async def search_posts( + self, + keywords: str, + date_posted: str | None = None, + max_pages: int = 3, + ) -> dict[str, Any]: + """Search LinkedIn posts/content and extract the results page. + + Reproduces the LinkedIn "Posts" content-search tab — the surface for + catching informal "we're hiring" / "Buscamos ..." posts before a + formal job listing exists. + + Args: + keywords: Free-text query (e.g. "Buscamos Unity", "estamos contratando"). + date_posted: Optional recency filter. One of ``"past-24h"``, + ``"past-week"``, ``"past-month"`` (underscore aliases also + accepted). Invalid values raise ``FilterValidationError`` + (a ``ValueError`` subclass). + max_pages: Scroll depth, expressed in result "pages" of roughly + ``_CONTENT_SCROLLS_PER_PAGE`` scrolls each (default 3). 
Content + search is an infinite scroll with no per-page URL, so this caps + how far the page is scrolled rather than fetching discrete + ``&start=`` pages. + + Returns: + {url, sections: {search_results: text}} plus optional ``references`` + (``feed_post`` permalinks, post authors, companies) and + ``section_errors``. The LLM should parse the raw text to extract + each post's author, headline, body, date, and reaction counts. + """ + if ( + date_posted is not None + and date_posted.strip() + and date_posted.strip() not in _CONTENT_DATE_POSTED_MAP + ): + raise FilterValidationError( + f"Invalid date_posted {date_posted!r}; expected one of " + "'past-24h', 'past-week', 'past-month'." + ) + + url = self._build_content_search_url(keywords, date_posted=date_posted) + max_scrolls = max(1, max_pages) * _CONTENT_SCROLLS_PER_PAGE + extracted = await self.extract_page( + url, section_name="search_results", max_scrolls=max_scrolls + ) + + sections: dict[str, str] = {} + references: dict[str, list[Reference]] = {} + section_errors: dict[str, dict[str, Any]] = {} + if extracted.text and extracted.text != _RATE_LIMITED_MSG: + sections["search_results"] = extracted.text + if extracted.references: + references["search_results"] = extracted.references + elif extracted.text == _RATE_LIMITED_MSG: + section_errors["search_results"] = { + "error_type": "rate_limit", + "error_message": extracted.text, + } + elif extracted.error: + section_errors["search_results"] = extracted.error + + result: dict[str, Any] = {"url": url, "sections": sections} + if references: + result["references"] = references + if section_errors: + result["section_errors"] = section_errors + return result + async def get_inbox(self, limit: int = 20) -> dict[str, Any]: """List recent conversations from the messaging inbox.""" url = "https://www.linkedin.com/messaging/" diff --git a/linkedin_mcp_server/server.py b/linkedin_mcp_server/server.py index c064d614..29c8e36a 100644 --- a/linkedin_mcp_server/server.py +++ 
b/linkedin_mcp_server/server.py @@ -27,6 +27,7 @@ from linkedin_mcp_server.tools.job import register_job_tools from linkedin_mcp_server.tools.messaging import register_messaging_tools from linkedin_mcp_server.tools.person import register_person_tools +from linkedin_mcp_server.tools.post import register_post_tools logger = logging.getLogger(__name__) @@ -62,6 +63,7 @@ def create_mcp_server(*, tool_timeout: float = DEFAULT_TOOL_TIMEOUT_SECONDS) -> register_job_tools(mcp, tool_timeout=tool_timeout) register_messaging_tools(mcp, tool_timeout=tool_timeout) register_feed_tools(mcp, tool_timeout=tool_timeout) + register_post_tools(mcp, tool_timeout=tool_timeout) # Register session management tool @mcp.tool( diff --git a/linkedin_mcp_server/tools/__init__.py b/linkedin_mcp_server/tools/__init__.py index e45c195c..eb1bbad1 100644 --- a/linkedin_mcp_server/tools/__init__.py +++ b/linkedin_mcp_server/tools/__init__.py @@ -12,6 +12,7 @@ - Job tools: Job posting details and search functionality - Messaging tools: Inbox, conversations, search, and sending messages - Feed tools: Home feed scraping +- Post tools: Global post/content search Architecture: - FastMCP integration for MCP-compliant tool registration diff --git a/linkedin_mcp_server/tools/post.py b/linkedin_mcp_server/tools/post.py new file mode 100644 index 00000000..89bacc56 --- /dev/null +++ b/linkedin_mcp_server/tools/post.py @@ -0,0 +1,114 @@ +""" +LinkedIn post/content search tool. + +Performs LinkedIn's global content search (the "Posts" results tab) using +innerText extraction, so informal "we're hiring" / "Buscamos ..." posts can +be found before a formal job listing is published. Mirrors search_people: +build a /search/results/content/ URL, scroll to load results, and return the +raw innerText for the LLM to parse, plus post-permalink references. 
+""" + +import logging +from typing import Annotated, Any + +from fastmcp import Context, FastMCP +from fastmcp.exceptions import ToolError +from pydantic import Field + +from linkedin_mcp_server.config.schema import DEFAULT_TOOL_TIMEOUT_SECONDS +from linkedin_mcp_server.core.exceptions import AuthenticationError +from linkedin_mcp_server.dependencies import get_ready_extractor, handle_auth_error +from linkedin_mcp_server.error_handler import raise_tool_error +from linkedin_mcp_server.scraping.extractor import FilterValidationError + +logger = logging.getLogger(__name__) + + +def register_post_tools( + mcp: FastMCP, *, tool_timeout: float = DEFAULT_TOOL_TIMEOUT_SECONDS +) -> None: + """Register post/content-search tools with the MCP server.""" + + @mcp.tool( + timeout=tool_timeout, + title="Search Posts", + annotations={"readOnlyHint": True, "openWorldHint": True}, + tags={"post", "search"}, + exclude_args=["extractor"], + ) + async def search_posts( + keywords: str, + ctx: Context, + date_posted: str | None = None, + max_pages: Annotated[int, Field(ge=1, le=10)] = 3, + extractor: Any | None = None, + ) -> dict[str, Any]: + """ + Search LinkedIn posts/content globally by keyword (the "Posts" tab). + + Use this to catch informal hiring posts ("we're hiring", "Buscamos + ...", "estamos contratando", "join our team") that often appear before + a formal job listing exists. This is global content search, distinct + from get_feed (your own home feed) and get_company_posts (one + company's page). + + Args: + keywords: Search keywords (e.g., "Buscamos Unity", "AI automation hiring") + ctx: FastMCP context for progress reporting + date_posted: Optional recency filter. One of "past-24h", + "past-week", "past-month" (underscore aliases like + "past_week" also accepted). Omit for any time. + max_pages: Scroll depth as result "pages" of ~5 scrolls each + (1-10, default 3). 
Content search is an infinite scroll, so + this caps how far the page is scrolled rather than fetching + discrete pages. + + Returns: + Dict with url, sections (search_results -> raw text), and optional + references (post permalinks, authors, companies) and section_errors. + The LLM should parse the raw text to extract each post's author, + headline/role, company, body, posted date, and reaction/comment + counts. + """ + try: + extractor = extractor or await get_ready_extractor( + ctx, tool_name="search_posts" + ) + logger.info( + "Searching posts: keywords='%s', date_posted='%s', max_pages=%d", + keywords, + date_posted, + max_pages, + ) + + await ctx.report_progress( + progress=0, total=100, message="Starting post search" + ) + + try: + result = await extractor.search_posts( + keywords, + date_posted=date_posted, + max_pages=max_pages, + ) + except FilterValidationError as e: + # Validation messages carry actionable detail; surface them as + # ToolError so mask_error_details doesn't reduce them to a + # generic "Error calling tool 'search_posts'". + raise ToolError(str(e)) from e + + await ctx.report_progress(progress=100, total=100, message="Complete") + + return result + + except ToolError: + # Already a properly formatted client-facing error; do not log it + # as "Unexpected error" via raise_tool_error. 
+ raise + except AuthenticationError as e: + try: + await handle_auth_error(e, ctx) + except Exception as relogin_exc: + raise_tool_error(relogin_exc, "search_posts") + except Exception as e: + raise_tool_error(e, "search_posts") # NoReturn diff --git a/tests/test_scraping.py b/tests/test_scraping.py index 65d3f66b..34bf3c20 100644 --- a/tests/test_scraping.py +++ b/tests/test_scraping.py @@ -2707,6 +2707,114 @@ async def test_search_people_combines_all_filters(self, mock_page): assert "currentCompany=%5B%221115%22%5D" in result["url"] +class TestBuildContentSearchUrl: + """Tests for _build_content_search_url URL construction.""" + + def test_basic_keywords(self): + url = LinkedInExtractor._build_content_search_url("Buscamos Unity") + assert url == ( + "https://www.linkedin.com/search/results/content/" + "?keywords=Buscamos+Unity&origin=FACETED_SEARCH" + ) + + def test_date_posted_past_week(self): + url = LinkedInExtractor._build_content_search_url( + "Buscamos Unity", date_posted="past-week" + ) + assert "datePosted=%5B%22past-week%22%5D" in url + + def test_date_posted_alias_normalized(self): + url = LinkedInExtractor._build_content_search_url( + "python", date_posted="past_24_hours" + ) + assert "datePosted=%5B%22past-24h%22%5D" in url + + def test_no_date_posted_omits_facet(self): + url = LinkedInExtractor._build_content_search_url("python") + assert "datePosted" not in url + + +@pytest.mark.asyncio +class TestSearchPosts: + async def test_returns_results_and_url(self, mock_page): + extractor = LinkedInExtractor(mock_page) + with patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + return_value=extracted("We're hiring a Unity dev"), + ) as mock_extract: + result = await extractor.search_posts("Buscamos Unity") + + assert "/search/results/content/" in result["url"] + assert "origin=FACETED_SEARCH" in result["url"] + assert result["sections"]["search_results"] == "We're hiring a Unity dev" + # max_pages default (3) -> 15 scrolls + 
mock_extract.assert_awaited_once_with( + ANY, section_name="search_results", max_scrolls=15 + ) + + async def test_date_posted_in_url(self, mock_page): + extractor = LinkedInExtractor(mock_page) + with patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + return_value=extracted("post"), + ): + result = await extractor.search_posts( + "Buscamos Unity", date_posted="past-week" + ) + + assert "datePosted=%5B%22past-week%22%5D" in result["url"] + + async def test_max_pages_controls_scroll_depth(self, mock_page): + extractor = LinkedInExtractor(mock_page) + with patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + return_value=extracted("post"), + ) as mock_extract: + await extractor.search_posts("python", max_pages=2) + + mock_extract.assert_awaited_once_with( + ANY, section_name="search_results", max_scrolls=10 + ) + + async def test_invalid_date_posted_raises(self, mock_page): + extractor = LinkedInExtractor(mock_page) + with pytest.raises(ValueError, match="Invalid date_posted"): + await extractor.search_posts("python", date_posted="last-year") + + async def test_empty_results_omit_optional_keys(self, mock_page): + extractor = LinkedInExtractor(mock_page) + with patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + return_value=extracted(""), + ): + result = await extractor.search_posts("nothing matches this query") + + assert result["sections"] == {} + assert "references" not in result + assert "section_errors" not in result + + async def test_rate_limited_surfaces_section_error(self, mock_page): + extractor = LinkedInExtractor(mock_page) + with patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + return_value=extracted(_RATE_LIMITED_MSG), + ): + result = await extractor.search_posts("python") + + assert result["sections"] == {} + assert result["section_errors"]["search_results"]["error_type"] == "rate_limit" + + class TestStripLinkedInNoise: def test_strips_footer(self): text = "Bill 
Gates\nChair, Gates Foundation\n\nAbout\nAccessibility\nTalent Solutions\nCareers" diff --git a/tests/test_tools.py b/tests/test_tools.py index 1bda7e89..0ca4d64c 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -35,6 +35,7 @@ def _make_mock_extractor(scrape_result: dict) -> MagicMock: mock.send_message = AsyncMock(return_value=scrape_result) mock.get_my_profile = AsyncMock(return_value=scrape_result) mock.search_companies = AsyncMock(return_value=scrape_result) + mock.search_posts = AsyncMock(return_value=scrape_result) mock.get_company_employees = AsyncMock(return_value=scrape_result) mock.extract_page = AsyncMock( return_value=ExtractedSection(text="some text", references=[]) @@ -1151,6 +1152,76 @@ async def test_get_feed_rejects_excessive_num_posts(self, mock_context): await mcp.call_tool("get_feed", {"num_posts": 51}) +class TestPostTools: + async def test_search_posts_success(self, mock_context): + expected = { + "url": ( + "https://www.linkedin.com/search/results/content/" + "?keywords=Buscamos+Unity&origin=FACETED_SEARCH" + ), + "sections": {"search_results": "Acme is hiring a Unity dev!"}, + } + mock_extractor = _make_mock_extractor(expected) + + from linkedin_mcp_server.tools.post import register_post_tools + + mcp = FastMCP("test") + register_post_tools(mcp) + + tool_fn = await get_tool_fn(mcp, "search_posts") + result = await tool_fn( + "Buscamos Unity", + mock_context, + date_posted="past-week", + extractor=mock_extractor, + ) + assert "search_results" in result["sections"] + mock_extractor.search_posts.assert_awaited_once_with( + "Buscamos Unity", + date_posted="past-week", + max_pages=3, + ) + + async def test_search_posts_validation_error_surfaced_as_tool_error( + self, mock_context + ): + """A FilterValidationError from the extractor surfaces to the client as + a ToolError carrying the same message, not the generic mask.""" + from fastmcp.exceptions import ToolError + + from linkedin_mcp_server.scraping.extractor import 
FilterValidationError + from linkedin_mcp_server.tools.post import register_post_tools + + mock_extractor = MagicMock() + mock_extractor.search_posts = AsyncMock( + side_effect=FilterValidationError("Invalid date_posted 'last-year'") + ) + + mcp = FastMCP("test") + register_post_tools(mcp) + tool_fn = await get_tool_fn(mcp, "search_posts") + + with pytest.raises(ToolError, match="Invalid date_posted"): + await tool_fn( + "python", + mock_context, + date_posted="last-year", + extractor=mock_extractor, + ) + + async def test_search_posts_rejects_zero_max_pages(self, mock_context): + """Verify max_pages=0 is rejected by Field(ge=1) validation.""" + from pydantic import ValidationError + + from linkedin_mcp_server.tools.post import register_post_tools + + mcp = FastMCP("test") + register_post_tools(mcp) + + with pytest.raises(ValidationError, match="max_pages"): + await mcp.call_tool("search_posts", {"keywords": "python", "max_pages": 0}) + + class TestToolTimeouts: async def test_all_tools_have_global_timeout(self): from linkedin_mcp_server.server import create_mcp_server @@ -1172,6 +1243,7 @@ async def test_all_tools_have_global_timeout(self): "search_conversations", "send_message", "get_feed", + "search_posts", "close_session", ) @@ -1203,6 +1275,7 @@ async def test_all_tools_have_default_timeout(self): "search_conversations", "send_message", "get_feed", + "search_posts", "close_session", ) From 83c31dce9ed23e4209ab964577a34a595ebfffd1 Mon Sep 17 00:00:00 2001 From: AkaNebur Date: Sun, 21 Jun 2026 11:49:32 +0200 Subject: [PATCH 2/3] docs(post): document search_posts tool Adds the search_posts row to the README tool table (status: working), a Features bullet to docs/docker-hub.md, and the tool entry to the manifest.json tools array, per the CONTRIBUTING.md "Adding a New Tool" checklist. 
--- README.md | 1 + docs/docker-hub.md | 1 + manifest.json | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/README.md b/README.md index cb4b4af5..8e64306f 100644 --- a/README.md +++ b/README.md @@ -53,6 +53,7 @@ This MCP server is **free** and **open source**, supported by [**Unipile**](http | `search_people` | Search for people by keywords, location, connection degree (1st/2nd/3rd), and current company | working | | `get_job_details` | Get detailed information about a specific job posting | working | | `get_feed` | Get recent posts from the authenticated user's home feed | working | +| `search_posts` | Search posts/content globally by keyword (the "Posts" tab) with an optional recency filter (past-24h/past-week/past-month) | working | | `close_session` | Close browser session and clean up resources | working |
diff --git a/docs/docker-hub.md b/docs/docker-hub.md index ee5c431b..326c6084 100644 --- a/docs/docker-hub.md +++ b/docs/docker-hub.md @@ -18,6 +18,7 @@ A Model Context Protocol (MCP) server that connects AI assistants to LinkedIn. A - **Person Posts**: Get recent activity/posts from a person's profile - **Company Posts**: Get recent posts from a company's LinkedIn feed - **Home Feed**: Get recent posts from the authenticated user's LinkedIn home feed +- **Post Search**: Search posts/content globally by keyword (the "Posts" tab) with an optional recency filter - **Compact References**: Return typed per-section links alongside readable text without shipping full-page markdown ## Quick Start diff --git a/manifest.json b/manifest.json index 0404387c..eb750f11 100644 --- a/manifest.json +++ b/manifest.json @@ -104,6 +104,10 @@ "name": "get_feed", "description": "Get recent posts from the authenticated user's LinkedIn home feed" }, + { + "name": "search_posts", + "description": "Search LinkedIn posts/content globally by keyword (the 'Posts' tab) with an optional recency filter (past-24h/past-week/past-month)" + }, { "name": "close_session", "description": "Properly close browser session and clean up resources" From db7d14ba77323744aa0b71de90cfe74423ecbf0d Mon Sep 17 00:00:00 2001 From: AkaNebur Date: Sun, 21 Jun 2026 12:43:37 +0200 Subject: [PATCH 3/3] fix(post): omit whitespace-only date_posted facet Addresses review feedback on #532: - _build_content_search_url now guards on date_posted.strip(), so a whitespace-only value (e.g. " ") is omitted from the URL instead of being appended as an invalid datePosted facet. The stripped value is also used as the alias-map fallback so passthrough tokens are normalised. This keeps the builder in sync with the search_posts validation, which already short-circuits on a falsy strip(). 
- Add a regression test for the whitespace case, plus a test for the previously-uncovered `elif extracted.error:` branch (a navigation error surfaces a typed section_errors entry, mirroring search_people). --- linkedin_mcp_server/scraping/extractor.py | 6 ++++-- tests/test_scraping.py | 24 +++++++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/linkedin_mcp_server/scraping/extractor.py b/linkedin_mcp_server/scraping/extractor.py index 231d4bc6..874028f8 100644 --- a/linkedin_mcp_server/scraping/extractor.py +++ b/linkedin_mcp_server/scraping/extractor.py @@ -3382,8 +3382,10 @@ def _build_content_search_url( through unchanged (callers validate first). """ params = f"keywords={quote_plus(keywords)}&origin=FACETED_SEARCH" - if date_posted: - token = _CONTENT_DATE_POSTED_MAP.get(date_posted.strip(), date_posted) + if date_posted and date_posted.strip(): + token = _CONTENT_DATE_POSTED_MAP.get( + date_posted.strip(), date_posted.strip() + ) params += f"&datePosted={_encode_list_facet([token])}" return f"https://www.linkedin.com/search/results/content/?{params}" diff --git a/tests/test_scraping.py b/tests/test_scraping.py index 34bf3c20..c08a9445 100644 --- a/tests/test_scraping.py +++ b/tests/test_scraping.py @@ -2733,6 +2733,12 @@ def test_no_date_posted_omits_facet(self): url = LinkedInExtractor._build_content_search_url("python") assert "datePosted" not in url + def test_whitespace_date_posted_omits_facet(self): + # Whitespace-only date_posted must be ignored, not appended as an + # invalid facet token (regression guard). 
+ url = LinkedInExtractor._build_content_search_url("python", date_posted=" ") + assert "datePosted" not in url + @pytest.mark.asyncio class TestSearchPosts: @@ -2814,6 +2820,24 @@ async def test_rate_limited_surfaces_section_error(self, mock_page): assert result["sections"] == {} assert result["section_errors"]["search_results"]["error_type"] == "rate_limit" + async def test_navigation_error_surfaces_section_error(self, mock_page): + extractor = LinkedInExtractor(mock_page) + with patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + return_value=extracted( + "", error={"error_type": "navigation_error", "error_message": "timeout"} + ), + ): + result = await extractor.search_posts("python") + + assert result["sections"] == {} + assert result["section_errors"]["search_results"] == { + "error_type": "navigation_error", + "error_message": "timeout", + } + class TestStripLinkedInNoise: def test_strips_footer(self):