From 697d3dd991241352ef925640b8bacfc331b79568 Mon Sep 17 00:00:00 2001 From: Pierre918 Date: Sat, 20 Jun 2026 10:43:26 +0200 Subject: [PATCH 1/2] feat(search): paginate search_people via max_pages --- README.md | 2 +- linkedin_mcp_server/scraping/extractor.py | 66 ++++++++++++++---- linkedin_mcp_server/tools/person.py | 9 ++- manifest.json | 2 +- tests/test_scraping.py | 81 +++++++++++++++++++++++ tests/test_tools.py | 24 +++++++ 6 files changed, 169 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index cb4b4af5..5f4a33dd 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ This MCP server is **free** and **open source**, supported by [**Unipile**](http | `search_companies` | Search for companies on LinkedIn by keywords | working | | `get_company_employees` | List employees at a company from the /people/ page, with optional keyword filter | working | | `search_jobs` | Search for jobs with keywords and location filters | working | -| `search_people` | Search for people by keywords, location, connection degree (1st/2nd/3rd), and current company | working | +| `search_people` | Search for people by keywords, location, connection degree (1st/2nd/3rd), and current company, with optional multi-page pagination | working | | `get_job_details` | Get detailed information about a specific job posting | working | | `get_feed` | Get recent posts from the authenticated user's home feed | working | | `close_session` | Close browser session and clean up resources | working | diff --git a/linkedin_mcp_server/scraping/extractor.py b/linkedin_mcp_server/scraping/extractor.py index 3b0b4549..1530b081 100644 --- a/linkedin_mcp_server/scraping/extractor.py +++ b/linkedin_mcp_server/scraping/extractor.py @@ -3245,8 +3245,15 @@ async def search_people( location: str | None = None, network: list[str] | None = None, current_company: str | None = None, + max_pages: int = 1, ) -> dict[str, Any]: - """Search for people and extract the results page. 
+ """Search for people and extract the results page(s). + + Paginates through LinkedIn's people search via the ``&page=N`` URL + parameter (1-based). Each page yields ~10 results; ``max_pages`` caps + how many are fetched. Pagination stops early when a page surfaces no + new ``person`` references (the locale-independent end-of-results + signal), so requesting more pages than exist is harmless. Args: keywords: Free-text query ("software engineer", "recruiter at Google"). @@ -3262,9 +3269,13 @@ async def search_people( unfiltered result set. Look up a company's URN via ``get_company_profile`` -- it is exposed under ``references["about"]``. + max_pages: Maximum number of result pages to load (default 1). Returns: - {url, sections: {name: text}} + {url, sections: {search_results: text}} where ``url`` is the + first-page URL and ``search_results`` joins each page's text with + ``\\n---\\n``. Optional ``references`` and ``section_errors`` keys + follow the standard tool return shape. """ if network is not None: invalid = [t for t in network if t not in _NETWORK_TOKENS] @@ -3290,21 +3301,52 @@ async def search_people( if current_company: params += f"&currentCompany={_encode_list_facet([current_company])}" - url = f"https://www.linkedin.com/search/results/people/?{params}" - extracted = await self.extract_page(url, section_name="search_results") + base_url = f"https://www.linkedin.com/search/results/people/?{params}" - sections: dict[str, str] = {} - references: dict[str, list[Reference]] = {} + page_texts: list[str] = [] + all_references: list[Reference] = [] + seen_person_urls: set[str] = set() section_errors: dict[str, dict[str, Any]] = {} - if extracted.text and extracted.text != _RATE_LIMITED_MSG: - sections["search_results"] = extracted.text + + for page_num in range(max_pages): + if page_num > 0: + await asyncio.sleep(_NAV_DELAY) + + url = base_url if page_num == 0 else f"{base_url}&page={page_num + 1}" + extracted = await self.extract_page(url, 
section_name="search_results") + + if not extracted.text or extracted.text == _RATE_LIMITED_MSG: + if extracted.error: + section_errors["search_results"] = extracted.error + # Navigation failed or rate-limited; nothing more to paginate. + break + + # End-of-results detection (locale-independent): a page beyond the + # first that surfaces no new /in/ profile anchors means we have run + # past the last page of results. + page_person_urls = { + ref["url"] for ref in extracted.references if ref["kind"] == "person" + } + new_person_urls = page_person_urls - seen_person_urls + if page_num > 0 and not new_person_urls: + logger.debug("No new person results on page %d, stopping", page_num + 1) + break + + seen_person_urls |= page_person_urls + page_texts.append(extracted.text) if extracted.references: - references["search_results"] = extracted.references - elif extracted.error: - section_errors["search_results"] = extracted.error + all_references.extend(extracted.references) + + sections: dict[str, str] = {} + references: dict[str, list[Reference]] = {} + if page_texts: + sections["search_results"] = "\n---\n".join(page_texts) + deduped = dedupe_references(all_references) + if deduped: + references["search_results"] = deduped result: dict[str, Any] = { - "url": url, + "url": base_url, "sections": sections, } if references: diff --git a/linkedin_mcp_server/tools/person.py b/linkedin_mcp_server/tools/person.py index acd74ebd..b5563f88 100644 --- a/linkedin_mcp_server/tools/person.py +++ b/linkedin_mcp_server/tools/person.py @@ -114,6 +114,7 @@ async def search_people( location: str | None = None, network: list[str] | None = None, current_company: str | None = None, + max_pages: Annotated[int, Field(ge=1, le=10)] = 1, extractor: Any | None = None, ) -> dict[str, Any]: """ @@ -126,6 +127,9 @@ async def search_people( network: Optional connection-degree filter. Each element is one of "F" (1st-degree), "S" (2nd-degree), "O" (3rd-degree and beyond). 
Example: ["F"] to only return 1st-degree connections. + max_pages: Maximum number of result pages to load (1-10, default 1). + Each page holds ~10 results. Pagination stops early once a page + returns no new people, so over-requesting pages is safe. current_company: Optional current-employer filter. LinkedIn's currentCompany facet only filters on the numeric company URN id (e.g. "1115" for SAP); plain company names are accepted by the @@ -144,11 +148,13 @@ async def search_people( ctx, tool_name="search_people" ) logger.info( - "Searching people: keywords='%s', location='%s', network=%s, current_company='%s'", + "Searching people: keywords='%s', location='%s', network=%s, " + "current_company='%s', max_pages=%d", keywords, location, network, current_company, + max_pages, ) await ctx.report_progress( @@ -161,6 +167,7 @@ async def search_people( location, network=network, current_company=current_company, + max_pages=max_pages, ) except FilterValidationError as e: # Validation messages carry actionable detail; surface diff --git a/manifest.json b/manifest.json index 0404387c..7a5234c1 100644 --- a/manifest.json +++ b/manifest.json @@ -82,7 +82,7 @@ }, { "name": "search_people", - "description": "Search for people on LinkedIn by keywords, location, connection degree (1st/2nd/3rd), and current company" + "description": "Search for people on LinkedIn by keywords, location, connection degree (1st/2nd/3rd), and current company, with optional multi-page pagination via max_pages" }, { "name": "get_inbox", diff --git a/tests/test_scraping.py b/tests/test_scraping.py index 65d3f66b..c1d7f1b7 100644 --- a/tests/test_scraping.py +++ b/tests/test_scraping.py @@ -2706,6 +2706,87 @@ async def test_search_people_combines_all_filters(self, mock_page): assert "network=%5B%22F%22%5D" in result["url"] assert "currentCompany=%5B%221115%22%5D" in result["url"] + async def test_search_people_default_loads_single_page(self, mock_page): + extractor = LinkedInExtractor(mock_page) + with 
patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + return_value=extracted( + "Jane Doe", + [{"kind": "person", "url": "/in/jane/", "text": "Jane Doe"}], + ), + ) as mock_extract: + result = await extractor.search_people("engineer") + + assert mock_extract.await_count == 1 + assert "&page=" not in result["url"] + assert result["sections"]["search_results"] == "Jane Doe" + + async def test_search_people_paginates_and_joins_pages(self, mock_page): + extractor = LinkedInExtractor(mock_page) + pages = [ + extracted( + "Jane Doe", + [{"kind": "person", "url": "/in/jane/", "text": "Jane Doe"}], + ), + extracted( + "John Roe", + [ + {"kind": "person", "url": "/in/jane/", "text": "Jane Doe"}, + {"kind": "person", "url": "/in/john/", "text": "John Roe"}, + ], + ), + ] + with ( + patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + side_effect=pages, + ) as mock_extract, + patch( + "linkedin_mcp_server.scraping.extractor.asyncio.sleep", + new_callable=AsyncMock, + ), + ): + result = await extractor.search_people("engineer", max_pages=2) + + assert mock_extract.await_count == 2 + # Second navigation carries the &page=2 cursor; first one does not. + second_url = mock_extract.await_args_list[1].args[0] + assert "&page=2" in second_url + assert result["sections"]["search_results"] == "Jane Doe\n---\nJohn Roe" + # References are deduped by URL across pages (jane appears on both). + urls = [ref["url"] for ref in result["references"]["search_results"]] + assert urls == ["/in/jane/", "/in/john/"] + + async def test_search_people_stops_when_page_adds_no_new_people(self, mock_page): + extractor = LinkedInExtractor(mock_page) + repeated = [{"kind": "person", "url": "/in/jane/", "text": "Jane Doe"}] + pages = [ + extracted("Jane Doe", repeated), + # Same person, no new /in/ anchors -> past the last page. 
+ extracted("Jane Doe again", repeated), + extracted("should not be reached", repeated), + ] + with ( + patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + side_effect=pages, + ) as mock_extract, + patch( + "linkedin_mcp_server.scraping.extractor.asyncio.sleep", + new_callable=AsyncMock, + ), + ): + result = await extractor.search_people("engineer", max_pages=5) + + assert mock_extract.await_count == 2 + assert result["sections"]["search_results"] == "Jane Doe" + class TestStripLinkedInNoise: def test_strips_footer(self): diff --git a/tests/test_tools.py b/tests/test_tools.py index 1bda7e89..a03d4bb0 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -263,6 +263,7 @@ async def test_search_people(self, mock_context): "New York", network=None, current_company=None, + max_pages=1, ) async def test_search_people_with_network_and_company_filters(self, mock_context): @@ -297,6 +298,29 @@ async def test_search_people_with_network_and_company_filters(self, mock_context None, network=["F"], current_company="1115", + max_pages=1, + ) + + async def test_search_people_forwards_max_pages(self, mock_context): + expected = { + "url": "https://www.linkedin.com/search/results/people/?keywords=engineer", + "sections": {"search_results": "Jane Doe\n---\nJohn Roe"}, + } + mock_extractor = _make_mock_extractor(expected) + + from linkedin_mcp_server.tools.person import register_person_tools + + mcp = FastMCP("test") + register_person_tools(mcp) + + tool_fn = await get_tool_fn(mcp, "search_people") + await tool_fn("engineer", mock_context, max_pages=3, extractor=mock_extractor) + mock_extractor.search_people.assert_awaited_once_with( + "engineer", + None, + network=None, + current_company=None, + max_pages=3, ) async def test_search_people_validation_error_surfaced_as_tool_error( From 35b8ddf9f11f07b520ccb65e739e12c58729f879 Mon Sep 17 00:00:00 2001 From: Pierre918 Date: Sat, 20 Jun 2026 15:45:59 +0200 Subject: [PATCH 2/2] test(search): annotate repeated 
refs list for ty --- tests/test_scraping.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_scraping.py b/tests/test_scraping.py index c1d7f1b7..4bf10a63 100644 --- a/tests/test_scraping.py +++ b/tests/test_scraping.py @@ -2763,7 +2763,9 @@ async def test_search_people_paginates_and_joins_pages(self, mock_page): async def test_search_people_stops_when_page_adds_no_new_people(self, mock_page): extractor = LinkedInExtractor(mock_page) - repeated = [{"kind": "person", "url": "/in/jane/", "text": "Jane Doe"}] + repeated: list[Reference] = [ + {"kind": "person", "url": "/in/jane/", "text": "Jane Doe"} + ] pages = [ extracted("Jane Doe", repeated), # Same person, no new /in/ anchors -> past the last page.