Skip to content
This repository was archived by the owner on Aug 15, 2023. It is now read-only.

Commit 3fae71e

Browse files
vecnaascariandrea
andauthored
feat: tiktok profile scraper (#531)
Co-authored-by: ascariandrea <[email protected]>
1 parent 6bb18d8 commit 3fae71e

File tree

15 files changed

+14826
-54
lines changed

15 files changed

+14826
-54
lines changed

Diff for: packages/shared/src/extension/app.ts

+18-21
Original file line numberDiff line numberDiff line change
@@ -108,14 +108,11 @@ function setupObserver({
108108

109109
const observer = new MutationObserver(
110110
debounce((mutations) => {
111-
mutations.forEach(function (mutation) {
112-
// appLog.debug('mutation (%s) %O', mutation.type, mutation.target);
113-
114-
if (window?.document) {
115-
if (
116-
oldHref !== window.location.href &&
117-
platformMatch.test(window.location.href)
118-
) {
111+
// appLog.debug('mutation (%s) %O', mutation.type, mutation.target);
112+
113+
if (window?.document) {
114+
if (platformMatch.test(window.location.href)) {
115+
if (oldHref !== window.location.href) {
119116
const newHref = window.location.href;
120117

121118
appLog.debug(
@@ -126,25 +123,25 @@ function setupObserver({
126123

127124
onLocationChange(oldHref, newHref);
128125

129-
const routeHandlerKey = handlersList.find((h) => {
130-
const handler = handlers[h];
126+
oldHref = newHref;
127+
}
131128

132-
if (handler.match.type === 'route') {
133-
return window.location.pathname.match(handler.match.location);
134-
}
135-
return false;
136-
});
129+
const routeHandlerKey = handlersList.find((h) => {
130+
const handler = handlers[h];
137131

138-
if (routeHandlerKey) {
139-
appLog.debug('Route handler key %s', routeHandlerKey);
140-
const { handle, ...routeHandlerOpts } = handlers[routeHandlerKey];
141-
handle(window.document.body, routeHandlerOpts, routeHandlerKey);
132+
if (handler.match.type === 'route') {
133+
return window.location.pathname.match(handler.match.location);
142134
}
135+
return false;
136+
});
143137

144-
oldHref = newHref;
138+
if (routeHandlerKey) {
139+
appLog.debug('Route handler key %s', routeHandlerKey);
140+
const { handle, ...routeHandlerOpts } = handlers[routeHandlerKey];
141+
handle(window.document.body, routeHandlerOpts, routeHandlerKey);
145142
}
146143
}
147-
});
144+
}
148145
}, 300)
149146
);
150147

Diff for: platforms/tktrex/backend/lib/parserchain.js

+13
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ const parserList = {
1414
stitch: require('../parsers/stitch'),
1515
author: require('../parsers/author'),
1616
search: require('../parsers/search'),
17+
profile: require('../parsers/profile'),
1718
downloader: require('../parsers/downloader'),
1819
};
1920

@@ -47,6 +48,18 @@ function buildMetadata(entry) {
4748
return metadata;
4849
}
4950

51+
if (entry.findings.nature.type === 'profile') {
52+
const metadata = {
53+
...entry.findings.nature,
54+
// ...entry.findings.downloader,
55+
...entry.findings.profile,
56+
};
57+
metadata.savingTime = new Date(entry.source.html.savingTime);
58+
metadata.id = entry.source.html.id;
59+
metadata.publicKey = entry.source.html.publicKey;
60+
return metadata;
61+
}
62+
5063
/* else ... */
5164
const metadata = {
5265
...entry.findings.nature,

Diff for: platforms/tktrex/backend/lib/utils.js

+3-2
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,11 @@ function hash(obj, fields) {
1515
memo += fname + '∴' + JSON.stringify(_.get(obj, fname, '…miss!')) + ',';
1616
return memo;
1717
}, '');
18-
// debug("(note) hashing of %s", plaincnt);
1918
const sha1sum = crypto.createHash('sha1');
2019
sha1sum.update(plaincnt);
21-
return sha1sum.digest('hex');
20+
const retval = sha1sum.digest('hex');
21+
// debug('(note) hashing of %s\n%s', plaincnt, retval);
22+
return retval;
2223
}
2324

2425
function verifyRequestSignature(req) {

Diff for: platforms/tktrex/backend/parsers/profile.js

+67
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
const _ = require('lodash');
2+
const debug = require('debug')('parser:profile');
3+
4+
const getNatureByHref = require('./shared').getNatureByHref;
5+
6+
/**
 * Extract the metadata of a single video tile found in a profile page.
 *
 * The tile is identified by its link to `https://www.tiktok.com/@...` and by
 * the anchor carrying a `title` attribute. When either is missing the tile
 * cannot be described, and `null` is returned so the caller can discard it
 * instead of the whole page failing with a TypeError.
 *
 * @param {Element|null} renod - DOM node wrapping one video tile.
 * @param {number} order - position of the tile in the page, as given by the caller.
 * @returns {{order: number, video: object, title: string,
 *            views: (string|null), thumbnail: (string|null)}|null}
 *   the tile description, or `null` when link or title are not found.
 *   `views` and `thumbnail` are `null` when their element is absent.
 */
function getFullProfileMetadata(renod, order) {
  const vlink = renod?.querySelector('a[href^="https://www.tiktok.com/@"]');
  const titleel = renod?.querySelector('a[title]');

  // querySelector returns null on no match: bail out before dereferencing
  if (!vlink || !titleel) return null;

  const vidnat = getNatureByHref(vlink.getAttribute('href'));
  const title = titleel.getAttribute('title');

  // views counter and thumbnail are optional: a tile without them is still
  // worth keeping, so a missing element produces null instead of an exception
  const viewsel = renod.querySelector('[data-e2e="video-views"]');
  const views = viewsel?.textContent ?? null;
  const img = renod.querySelector('img[alt]');
  const thumbnail = img?.getAttribute('src') ?? null;

  return {
    order,
    video: vidnat,
    title,
    views,
    thumbnail,
  };
}
28+
29+
/**
 * Parser for profile pages.
 *
 * It only runs when a previous parser classified the page nature as
 * 'profile'. The selector works on the whole document rather than on one
 * video, hence the result is a list of videos.
 *
 * @param {object} envelop - parser input; `envelop.jsdom` is queried with
 *   `querySelectorAll`.
 * @param {object} previous - findings of the parsers executed before this
 *   one; only `previous.nature.type` is read.
 * @returns {false|object} `false` when the nature is not 'profile' (or is
 *   missing). Otherwise an object that contains:
 *   - `amount` and `results` when at least one video tile was parsed;
 *   - `error`, `message` and `hatespeech` when no tile was parsed and an
 *     `h2` with the text 'No results found' is present;
 *   - nothing (empty object) when neither condition applies.
 */
function profile(envelop, previous) {
  // optional chaining: a missing `previous` or `previous.nature` means
  // "not a profile", it should not raise a TypeError
  if (previous?.nature?.type !== 'profile') return false;

  /* the selector is not per video but per 'body', so every
     matching post is mapped to its metadata; tiles that
     can't be parsed (falsy return) are dropped */
  const posts = envelop.jsdom.querySelectorAll('[data-e2e="user-post-item"]');
  const results = Array.from(posts, (elem, i) =>
    getFullProfileMetadata(elem.parentNode, i + 1)
  ).filter(Boolean);

  debug("Video Results found in profile %d", results.length);
  if (results.length) {
    return { amount: results.length, results };
  }

  const retval = {};
  const errmsg = 'No results found';
  // there are various 'h2' but only one can be an error
  for (const h of envelop.jsdom.querySelectorAll('h2')) {
    if (h.textContent !== errmsg) continue;

    retval.error = errmsg;
    retval.message = h.parentNode.querySelector('p')?.textContent;
    // it can be 'hateful' or 'violate' but we don't know about other languages
    debug('No results found: found this message: %s', retval.message);
    retval.hatespeech = !!retval.message?.match(/hateful/);
  }

  return retval;
}
66+
67+
module.exports = profile;

Diff for: platforms/tktrex/backend/parsers/shared.js

+4-3
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ function getNatureByHref(href) {
2222
retval.type = 'video';
2323
retval.videoId = chunks[3];
2424
retval.authorId = chunks[1];
25-
} else if (_.startsWith(urlO.pathname, '/@')) {
26-
retval.type = 'creator';
27-
retval.creatorName = urlO.pathname.substr(1);
25+
} else if (_.startsWith(urlO.pathname, '/@') && chunks.length === 2) {
26+
retval.type = 'profile';
27+
retval.creatorName = chunks[1].substring(1);
2828
} else if (urlO.pathname === '/search') {
2929
retval.type = 'search';
3030
retval.query = urlO.searchParams.get('q');
@@ -51,6 +51,7 @@ function getUUID(url, type) {
5151
const fullname = type === 'video' ? `${fname}.mp4` : `${fname}.jpeg`;
5252
const cwd = process.cwd();
5353
if (!nconf.get('downloads')) {
54+
/* eslint-disable no-console */
5455
console.log("WRONG CONFIGURATION SETTINGS!! missing 'downloads' from", cwd);
5556
process.exit(1);
5657
}

Diff for: platforms/tktrex/backend/routes/events.js

+8-6
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ async function saveInDB(experinfo, objects, dbcollection) {
112112
}
113113
}
114114

115+
/*
115116
function handleFullSave(body, headers) {
116117
// ["html","href","feedId","feedCounter", "reason",
117118
// "videoCounter","rect","clientTime","type","incremental"]
@@ -132,7 +133,7 @@ function handleFullSave(body, headers) {
132133
geoip: geo(headers['x-forwarded-for']),
133134
researchTag: body.researchTag,
134135
};
135-
}
136+
} */
136137

137138
async function processEvents(req) {
138139
const headers = processHeaders(_.get(req, 'headers'), mandatoryHeaders);
@@ -161,14 +162,13 @@ async function processEvents(req) {
161162
// "videoCounter","rect","clientTime","type","incremental"]
162163
// 'type' can be ignored as it is always 'video' and doesn't reflect nature
163164

164-
console.log("--- feedId %s randomUUID",
165-
body.feedId ?? 'x', body.randomUUID ?? 'x')
166165
const id = utils.hash({
167166
clientRGN: body.feedId
168167
? body.feedId
169168
: body.href + new Date().toISOString(),
170169
serverPRGN: supporter.publicKey,
171-
impressionNumber: body.videoCounter || Math.random(),
170+
type: body.type,
171+
impressionNumber: body.videoCounter,
172172
});
173173
const timelineIdHash = utils.hash({
174174
session: body.feedId
@@ -193,6 +193,7 @@ async function processEvents(req) {
193193
optionalNumbers.push(_.size(body.html));
194194
const html = {
195195
id,
196+
type: body.type,
196197
rect: body.rect,
197198
href: body.href,
198199
timelineId: timelineWord + '-' + timelineIdHash.substr(0, 10),
@@ -208,9 +209,10 @@ async function processEvents(req) {
208209
);
209210

210211
debug(
211-
'[+] (p %s) from %s saving %s',
212+
'[+] (p %s) from %s -- %s -- %s',
212213
supporter.p,
213214
JSON.stringify(_.map(req.body, 'type')),
215+
JSON.stringify(_.map(req.body, 'href')),
214216
JSON.stringify(_.map(htmls, 'n'))
215217
);
216218

@@ -233,7 +235,7 @@ async function processEvents(req) {
233235
}
234236

235237
async function handshake(req) {
236-
debug('Not implemented protocol (yet) [handshake API %j]', req.body);
238+
// debug('Not implemented protocol (yet) [handshake API %j]', req.body);
237239
return {
238240
json: { ignored: true },
239241
};

Diff for: platforms/tktrex/backend/routes/personal.js

+48-8
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ const utils = require('../lib/utils');
66
const automo = require('../lib/automo');
77
const CSV = require('../lib/CSV');
88
const flattenSearch = require('./search').flattenSearch;
9+
const flattenProfile = require('./search').flattenProfile;
910

1011
function pickFeedFields(metae) {
1112
return {
@@ -33,7 +34,7 @@ async function getPersonal(req) {
3334
const amount = _.parseInt(req.query.amount) || 50;
3435
const skip = _.parseInt(req.query.skip) || 0;
3536
const what = req.params.what;
36-
const allowed = ['summary', 'search', 'foryou', 'following'];
37+
const allowed = ['summary', 'search', 'foryou', 'following', 'profile'];
3738

3839
if (allowed.indexOf(what) === -1) {
3940
return {
@@ -47,7 +48,7 @@ async function getPersonal(req) {
4748
}
4849

4950
debug(
50-
'Asked to get data kind %s (%d-%d), preparing JSON',
51+
'Requested data [nature %s] (%d-%d), preparing JSON',
5152
what,
5253
amount,
5354
skip
@@ -88,6 +89,24 @@ async function getPersonal(req) {
8889
counters: { metadata: avail.counters?.metadata },
8990
metadata,
9091
};
92+
} else if (what === 'profile') {
93+
/* this function access to 'search' results which is a
94+
* bit different than the other. as in the collection
95+
* there is not one entry for video, but one entry for search
96+
* query --> hence, the _.map/_.pick
97+
* note, this data should match
98+
* packages/shared/src/models/contributor/ContributorPersonalSummary.ts
99+
*/
100+
const avail = await automo.getPersonalTableData(
101+
k,
102+
{ type: 'profile' },
103+
{ amount, skip }
104+
);
105+
106+
retval = {
107+
counters: { metadata: avail.counters?.metadata },
108+
metadata: avail.metadata,
109+
};
91110
} else if (what === 'foryou' || what === 'following') {
92111
retval = await automo.getMetadataByFilter(
93112
{ type: what, publicKey: k },
@@ -111,26 +130,47 @@ async function getPersonal(req) {
111130
}
112131

113132
async function getPersonalCSV(req) {
114-
const CSV_MAX_SIZE = 1000;
133+
const CSV_MAX_SIZE = 9000;
115134
const k = req.params.publicKey;
116135
const type = req.params.what;
117-
if (['foryou', 'search', 'following'].indexOf(type) === -1)
118-
return { text: 'Error, only foryou and search is supported ' };
136+
137+
if (['foryou', 'search', 'following', 'profile'].indexOf(type) === -1)
138+
return { text: 'Error, nature not supported ' };
119139

120140
const data = await automo.getMetadataByFilter(
121141
{ publicKey: k, type },
122142
{ amount: CSV_MAX_SIZE, skip: 0 }
123143
);
124144

125-
/* remind self, search has a different logic than for you,
145+
if (!data.length) {
146+
debug("getPersonalCSV didn't found DB entry matching %o", {
147+
publicKey: k,
148+
type,
149+
});
150+
return { text: 'No data not found in the DB' };
151+
}
152+
153+
debug(
154+
'type [%s] return %d with amount %d skip-zero',
155+
type,
156+
data.length,
157+
CSV_MAX_SIZE
158+
);
159+
160+
/* remind: search and profile have a different logic than
161+
foryou and following.
126162
this is why is a reduce instead of map */
127163
let unrolledData = [];
128164
if (type === 'search') unrolledData = _.reduce(data, flattenSearch, []);
165+
else if (type === 'profile')
166+
unrolledData = _.reduce(data, flattenProfile, []);
129167
else unrolledData = _.map(data, pickFeedFields);
130168

131169
if (!unrolledData.length) {
132-
debug('getPersonalCSV return empty data');
133-
return { text: 'Data not found: are you sure any search worked?' };
170+
debug(
171+
'getPersonalCSV produced empty data during transformation: investigate parsers and pipeline!'
172+
);
173+
return { text: 'Data not found, from metadata: ' + data.length };
134174
}
135175

136176
/* XXX TMP FIXME (not if we pick the pseudo via mongodb)

0 commit comments

Comments
 (0)