Commit a40201c

[tiktok] rework search scraper
1 parent 6ec2105 commit a40201c

3 files changed: +39 −81 lines changed

tiktok-scraper/run.py

Lines changed: 4 additions & 8 deletions

@@ -48,14 +48,10 @@ async def run():
     with open(output.joinpath("profiles.json"), "w", encoding="utf-8") as file:
         json.dump(profiles_data, file, indent=2, ensure_ascii=False)

-    search_data = await tiktok.scrape_search(
-        keyword="whales",
-        max_search=18
-    )
-    # the current API URL scrapes video and profile data from search pages using the general search API.
-    # to get the specific data API URLs, you can filter the results by profiles or videos and inspect the network:
-    # profiles -> https://www.tiktok.com/api/search/user/full/?cursor=0&keyword=whales
-    # videos -> https://www.tiktok.com/api/search/item/full/?cursor=0&keyword=whales
+    search_data = await tiktok.scrape_search(keyword="whales")
+    # the search scraper scrolls the search page to load results dynamically
+    # it will scroll up to 15 times (configurable in the js scroll code)
+    # the results are extracted from XHR calls captured during scrolling
     with open(output.joinpath("search.json"), "w", encoding="utf-8") as file:
         json.dump(search_data, file, indent=2, ensure_ascii=False)
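
A note on usage: with max_search gone, the call site reduces to a single keyword argument. A minimal sketch of the reworked entry point, assuming the same `output` directory and `tiktok` module that run.py sets up earlier:

import asyncio
import json
from pathlib import Path

import tiktok

async def main():
    output = Path("results")
    output.mkdir(exist_ok=True)
    # result count is now bounded by the scroll iterations, not a max_search argument
    search_data = await tiktok.scrape_search(keyword="whales")
    with open(output.joinpath("search.json"), "w", encoding="utf-8") as file:
        json.dump(search_data, file, indent=2, ensure_ascii=False)

asyncio.run(main())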

tiktok-scraper/test.py

Lines changed: 1 addition & 4 deletions

@@ -167,10 +167,7 @@ async def test_profile_scraping():
 @pytest.mark.asyncio
 @pytest.mark.flaky(reruns=3, reruns_delay=30)
 async def test_search_scraping():
-    search_data = await tiktok.scrape_search(
-        keyword="whales",
-        max_search=20
-    )
+    search_data = await tiktok.scrape_search(keyword="whales")
     validator = Validator(search_schema, allow_unknown=True)
     for item in search_data:
         assert validator.validate(item), {"item": item, "errors": validator.errors}

tiktok-scraper/tiktok.py

Lines changed: 34 additions & 69 deletions

@@ -201,14 +201,20 @@ async def scrape_profiles(urls: List[str]) -> List[Dict]:


 def parse_search(response: ScrapeApiResponse) -> List[Dict]:
-    """parse search data from the API response"""
-    try:
-        data = json.loads(response.scrape_result["content"])
-        search_data = data["data"]
-    except Exception as e:
-        log.error(f"Failed to parse JSON from search API response: {e}")
-        return None
-
+    """parse search data from XHR calls"""
+    # extract the XHR calls and filter the ones for search results
+    _xhr_calls = response.scrape_result["browser_data"]["xhr_call"]
+    search_calls = [c for c in _xhr_calls if "/api/search/general/full/" in c["url"]]
+    search_data = []
+    for search_call in search_calls:
+        try:
+            data = json.loads(search_call["response"]["body"])["data"]
+        except Exception as e:
+            log.error(f"Failed to parse search data from XHR call: {e}")
+            continue
+        search_data.extend(data)
+
+    # parse all the data using jmespath
     parsed_search = []
     for item in search_data:
         if item["type"] == 1:  # get the item if it was item only
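
For context, the shape parse_search now consumes: each captured XHR call is a dict with a "url" and a "response" whose "body" holds the raw JSON. A hedged sketch with a fabricated record (illustrative only, not real API output):

import json

# fabricated XHR records mimicking scrapfly's browser_data["xhr_call"] entries
fake_xhr_calls = [
    {
        "url": "https://www.tiktok.com/api/search/general/full/?keyword=whales",
        "response": {"body": json.dumps({"data": [{"type": 1}]})},
    },
    {"url": "https://www.tiktok.com/api/recommend/", "response": {"body": "{}"}},
]

# the same filter as above: keep only the general search API calls
search_calls = [c for c in fake_xhr_calls if "/api/search/general/full/" in c["url"]]
search_data = []
for call in search_calls:
    search_data.extend(json.loads(call["response"]["body"])["data"])

assert len(search_data) == 1  # only the search call contributes results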
@@ -226,72 +232,31 @@ def parse_search(response: ScrapeApiResponse) -> List[Dict]:
         )
         result["type"] = item["type"]
         parsed_search.append(result)
-
-    # whether there are more search results: 0 or 1; there is no max searches available
-    has_more = data["has_more"]
     return parsed_search


-async def obtain_session(url: str) -> str:
-    """create a session to save the cookies and authorize the search API"""
-    session_id = str(uuid.uuid4().hex)
-    await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG, render_js=True, session=session_id))
-    return session_id
-
-
-async def scrape_search(keyword: str, max_search: int, search_count: int = 12) -> List[Dict]:
-    """scrape tiktok search data from the search API"""
-
-    def generate_search_id():
-        # get the current datetime and format it as YYYYMMDDHHMMSS
-        timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
-        # calculate the length of the random hex required for the total length (32)
-        random_hex_length = (32 - len(timestamp)) // 2  # calculate bytes needed
-        random_hex = secrets.token_hex(random_hex_length).upper()
-        random_id = timestamp + random_hex
-        return random_id
-
-    def form_api_url(cursor: int):
-        """form the search API URL and its pagination values"""
-        base_url = "https://www.tiktok.com/api/search/general/full/?"
-        params = {
-            "keyword": quote(keyword),
-            "offset": cursor,  # the index to start from
-            "search_id": generate_search_id(),
-        }
-        return base_url + urlencode(params)
-
-    log.info("obtaining a session for the search API")
-    session_id = await obtain_session(url="https://www.tiktok.com/search?q=" + quote(keyword))
-
-    log.info("scraping the first search batch")
-    first_page = await SCRAPFLY.async_scrape(
+async def scrape_search(keyword: str) -> List[Dict]:
+    """scrape tiktok search data by scrolling the search page"""
+    # js code for scrolling down with a maximum of 15 scrolls; it stops early once the page bottom is reached
+    js = """const scrollToEnd = (i = 0) => (window.innerHeight + window.scrollY >= document.body.scrollHeight || i >= 15) ? (console.log("Reached the bottom or maximum iterations. Stopping further iterations."), setTimeout(() => console.log("Waited 10 seconds after all iterations."), 10000)) : (window.scrollTo(0, document.body.scrollHeight), setTimeout(() => scrollToEnd(i + 1), 10000)); setTimeout(() => scrollToEnd(), 5000);"""
+    url = f"https://www.tiktok.com/search?q={quote(keyword)}"
+    log.info(f"scraping search page with the URL {url} for search data")
+    response = await SCRAPFLY.async_scrape(
         ScrapeConfig(
-            form_api_url(cursor=0),
-            **BASE_CONFIG,
-            headers={
-                "content-type": "application/json",
-            },
-            session=session_id,
+            url,
+            asp=True,
+            country="AU",
+            wait_for_selector="//div[@data-e2e='search_top-item']",
+            render_js=True,
+            auto_scroll=True,
+            rendering_wait=10000,
+            js=js,
+            debug=True,
         )
     )
-    search_data = parse_search(first_page)
-
-    # scrape the remaining search pages concurrently
-    log.info(f"scraping search pagination, remaining {max_search // search_count} more pages")
-    _other_pages = [
-        ScrapeConfig(
-            form_api_url(cursor=cursor), **BASE_CONFIG, headers={"content-type": "application/json"}, session=session_id
-        )
-        for cursor in range(search_count, max_search + search_count, search_count)
-    ]
-    async for response in SCRAPFLY.concurrent_scrape(_other_pages):
-        data = parse_search(response)
-        if data is not None:
-            search_data.extend(data)
-
-    log.success(f"scraped {len(search_data)} from the search API from the keyword {keyword}")
-    return search_data
+    data = parse_search(response)
+    log.success(f"scraped {len(data)} search results for keyword: {keyword}")
+    return data


 def parse_channel(response: ScrapeApiResponse):
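
The run.py comment above notes the scroll count is "configurable in the js scroll code". A hedged sketch of one way to parametrize it; scroll_js, max_scrolls, and delay_ms are hypothetical names, not part of this commit:

# hypothetical helper: build the scroll script with a configurable cap
def scroll_js(max_scrolls: int = 15, delay_ms: int = 10000) -> str:
    """return the recursive scroll script with an adjustable iteration limit"""
    return (
        "const scrollToEnd = (i = 0) => "
        "(window.innerHeight + window.scrollY >= document.body.scrollHeight "
        f"|| i >= {max_scrolls}) "
        '? (console.log("Reached the bottom or maximum iterations."), '
        'setTimeout(() => console.log("Done waiting."), 10000)) '
        ": (window.scrollTo(0, document.body.scrollHeight), "
        f"setTimeout(() => scrollToEnd(i + 1), {delay_ms})); "
        "setTimeout(() => scrollToEnd(), 5000);"
    )

# e.g. pass js=scroll_js(max_scrolls=30) to ScrapeConfig for deeper result sets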
@@ -326,7 +291,7 @@ def parse_channel(response: ScrapeApiResponse):
 async def scrape_channel(url: str) -> List[Dict]:
     """scrape video data from a channel (profile with videos)"""
     # js code for scrolling down with a maximum of 15 scrolls; it stops early once the page bottom is reached
-    js = """const scrollToEnd = (i = 0) => (window.innerHeight + window.scrollY >= document.body.scrollHeight || i >= 15) ? (console.log("Reached the bottom or maximum iterations. Stopping further iterations."), setTimeout(() => console.log("Waited 10 seconds after all iterations."), 10000)) : (window.scrollTo(0, document.body.scrollHeight), setTimeout(() => scrollToEnd(i + 1), 5000)); scrollToEnd();"""
+    js = """const scrollToEnd = (i = 0) => (window.innerHeight + window.scrollY >= document.body.scrollHeight || i >= 15) ? (console.log("Reached the bottom or maximum iterations. Stopping further iterations."), setTimeout(() => console.log("Waited 10 seconds after all iterations."), 10000)) : (window.scrollTo(0, document.body.scrollHeight), setTimeout(() => scrollToEnd(i + 1), 10000)); setTimeout(() => scrollToEnd(), 5000);"""
     log.info(f"scraping channel page with the URL {url} for post data")
     response = await SCRAPFLY.async_scrape(
         ScrapeConfig(