Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add comment scraping #693

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,8 @@ Options:
requests. For example to your own API [default: ""]
--method Receive data to your webhook url as POST or GET request
[choices: "GET", "POST"] [default: "POST"]
--includeComments Also save all comments when downloading a video. Needs a
valid session to work. [boolean] [default: false]
--help Show help [boolean]

Examples:
Expand Down
5 changes: 5 additions & 0 deletions bin/cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,11 @@ yargs
choices: ['GET', 'POST'],
describe: 'Receive data to your webhook url as POST or GET request',
},
includeComments: {
boolean: true,
default: false,
describe: 'Also save all comments when downloading a video. Needs a valid session to work.',
},
})
.check(argv => {
if (CONST.scrape.indexOf(argv._[0]) === -1) {
Expand Down
84 changes: 84 additions & 0 deletions src/core/TikTok.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import {
Headers,
WebHtmlUserMetadata,
VideoMetadata,
CommentsData,
} from '../types';

import { Downloader } from '../core';
Expand Down Expand Up @@ -127,6 +128,8 @@ export class TikTokScraper extends EventEmitter {

public cookieJar: CookieJar;

private includeComments: boolean;

constructor({
download,
filepath,
Expand Down Expand Up @@ -157,6 +160,7 @@ export class TikTokScraper extends EventEmitter {
headers,
verifyFp = '',
sessionList = [],
includeComments = false,
}: TikTokConstructor) {
super();
this.userIdStore = '';
Expand Down Expand Up @@ -221,6 +225,7 @@ export class TikTokScraper extends EventEmitter {
bad: 0,
};
this.store = [];
this.includeComments = includeComments;
}

/**
Expand Down Expand Up @@ -868,6 +873,7 @@ export class TikTokScraper extends EventEmitter {
name,
}))
: [],
comments: [],
};

if (this.event) {
Expand Down Expand Up @@ -1259,6 +1265,27 @@ export class TikTokScraper extends EventEmitter {
videoData = await this.getVideoMetadata();
}

// Collect *all* comments of the video by walking the comment endpoint page by page.
// This is best-effort: comment scraping needs a valid session, and any failure just
// ends the walk while keeping the pages that were already fetched.
let commentData: CommentsData | undefined;
if (this.includeComments) {
    const paginationStepSize = 30;
    try {
        for (let cursor = 0; cursor < videoData.stats.commentCount; cursor += paginationStepSize) {
            const data = await this.getCommentMetadata('', cursor, paginationStepSize);
            // no data could be retrieved: possibly no valid session; skip comment scraping
            if (!data) {
                break;
            }
            // A page may carry `comments: null`. Normalise it to an empty array so that
            // a null first page cannot make the next `concat` throw and silently
            // discard every following page.
            const pageComments = data.comments ?? [];
            if (commentData === undefined) {
                commentData = { ...data, comments: pageComments };
            } else {
                commentData.comments = commentData.comments.concat(pageComments);
            }
        }
    } catch {
        // continue regardless of error
    }
}

const videoItem = {
id: videoData.id,
secretID: videoData.video.id,
Expand Down Expand Up @@ -1326,6 +1353,7 @@ export class TikTokScraper extends EventEmitter {
name,
}))
: [],
comments: commentData?.comments,
} as PostCollector;

try {
Expand Down Expand Up @@ -1374,4 +1402,60 @@ export class TikTokScraper extends EventEmitter {
);
});
}

/**
 * Fetch one page of comments for a video from the comment list API endpoint.
 *
 * Only works with a valid session: the cookie jar must contain a `sid_tt` cookie.
 *
 * @param url Video URL of the form `tiktok.com/@<user>/video/<id>`; when empty, `this.input` is used.
 * @param _cursor Value sent as the `cursor` query parameter; the caller advances it by the page size.
 * @param _count Value sent as the `count` query parameter.
 * @returns The API response, when it reports `status_code === 0`.
 * @throws Error when no session cookie is present, when the request fails with HTTP 404,
 * or when no usable response could be obtained for any other reason.
 */
private async getCommentMetadata(url = '', _cursor = 0, _count = 30): Promise<CommentsData> {
    // abort, if no session is set
    if (!this.cookieJar.getCookieString('https://tiktok.com').includes('sid_tt')) {
        throw Error(`No valid session given. Can't download comments.`);
    }

    // The second capture group is the numeric video id. The dot is escaped so that
    // only a literal "tiktok.com" host matches.
    const match = /tiktok\.com\/(@[\w.-]+)\/video\/(\d+)/.exec(url || this.input);
    if (match) {
        const videoId = match[2];
        const uri = `https://www.tiktok.com/api/comment/list/`;

        // Typed as a string map so the signature can be added without `@ts-ignore`.
        const qs: Record<string, string | number> = {
            aweme_id: videoId,
            aid: 1988,
            history_len: 6,
            cursor: _cursor,
            count: _count,
        };

        // The signature is computed over the URL *without* the `_signature` parameter,
        // so it must be generated before that parameter is added.
        const unsignedParams = new URLSearchParams();
        for (const key of Object.keys(qs)) {
            unsignedParams.append(key, String(qs[key]));
        }
        const unsignedURL = `${uri}?${unsignedParams.toString()}`;
        qs._signature = sign(unsignedURL, this.headers['user-agent']);

        const query = {
            method: 'GET',
            uri,
            json: true,
            followAllRedirects: true,
            headers: {
                cookie: this.cookieJar.getCookieString(`https://tiktok.com/`),
            },
            qs,
        };

        try {
            const response = await this.request<CommentsData>(query);
            if (response.status_code === 0) {
                return response;
            }
        } catch (err) {
            // Only a 404 is reported with its own message; every other failure falls
            // through to the generic error below.
            const failure = err as { statusCode?: number; message?: string } | undefined;
            if (failure?.statusCode === 404) {
                throw new Error(failure.message ?? `Can't extract comment metadata of ${this.input}`);
            }
        }
    }
    throw new Error(`Can't extract comment metadata of ${this.input}`);
}
}
54 changes: 54 additions & 0 deletions src/types/TikTok.ts
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ export interface TikTokConstructor {
headers: Headers;
verifyFp?: string;
sessionList?: string[];
includeComments?: boolean;
}

export interface Hashtags {
Expand Down Expand Up @@ -161,6 +162,59 @@ export interface PostCollector {
repeated?: boolean;
downloaded: boolean;
effectStickers: EffectStickers[];
comments: Comment[];
}

/**
 * One comment as stored in `PostCollector.comments`.
 *
 * Declares the same fields as `CommentMetadata`, the raw shape returned by the
 * comment list endpoint.
 */
export interface Comment {
    aweme_id: string;
    cid: string;
    create_time: number;
    digg_count: number;
    status: number;
    text: string;
    author_pin: boolean;
    collect_stat: number;
    is_author_digged: boolean;
    no_show: boolean;
    reply_comment_total: number;
    reply_id: string;
    reply_to_reply_id: string;
    stick_position: number;
    /**
     * Element shape is not known from this file, hence `unknown[]`.
     * (The empty-tuple type `[]` would only admit zero-length arrays.)
     */
    text_extra: unknown[];
    user_buried: boolean;
    user_digged: number;
    // Noted as `null` and left untyped: label_list, reply_comment
    user: {
        avatar_thumb: {
            uri: string;
            url_list: string[];
        };
        custom_verify: string;
        enterprise_verify_reason: string;
        nickname: string;
        sec_uid: string;
        unique_id: string;
        uid: string;
        // Noted as `null` and left untyped:
        //   ad_cover_url, bold_fields, can_set_geofencing, cha_list, cover_url, events,
        //   followers_detail, geofencing, homepage_bottom_toast, item_list,
        //   mutual_relation_avatars, need_points, platform_sync_info, relative_users,
        //   search_highlight, type_label, user_tags, white_cover_url
    };
}

export interface Result {
Expand Down
65 changes: 65 additions & 0 deletions src/types/TikTokApi.ts
Original file line number Diff line number Diff line change
Expand Up @@ -373,3 +373,68 @@ export interface WebHtmlUserMetadata {
};
};
}

/**
 * Response envelope of the comment list endpoint, as requested by
 * `getCommentMetadata`.
 */
export interface CommentsData {
    /** `getCommentMetadata` only accepts a response whose value is `0`. */
    status_code: number;
    status_message: string;
    total: number;
    cursor: string;
    hasMore: boolean;
    /**
     * Comments contained in this page.
     * NOTE(review): the scraper checks this for `null` before concatenating pages,
     * so the API may send `null` here — confirm and widen the type if so.
     */
    comments: Array<CommentMetadata>;
    reply_style: number;
    alias_comment_deleted: boolean;
    // Noted but left untyped: log_pb, top_gifts
}

/**
 * One raw comment entry inside `CommentsData.comments`.
 */
export interface CommentMetadata {
    // identifiers
    cid: string;
    aweme_id: string;
    reply_id: string;
    reply_to_reply_id: string;

    // content
    text: string;
    // NOTE(review): `[]` is the empty-tuple type, so only zero-length arrays
    // type-check here — confirm whether an array type was intended.
    text_extra: [];
    create_time: number;

    // counters and numeric state
    digg_count: number;
    reply_comment_total: number;
    status: number;
    collect_stat: number;
    stick_position: number;
    user_digged: number;

    // flags
    author_pin: boolean;
    is_author_digged: boolean;
    no_show: boolean;
    user_buried: boolean;

    // Noted as `null` and left untyped: label_list, reply_comment

    /** Author of the comment. */
    user: {
        uid: string;
        sec_uid: string;
        unique_id: string;
        nickname: string;
        custom_verify: string;
        enterprise_verify_reason: string;
        avatar_thumb: {
            uri: string;
            url_list: Array<string>;
        };
        // Noted as `null` and left untyped:
        //   ad_cover_url, bold_fields, can_set_geofencing, cha_list, cover_url, events,
        //   followers_detail, geofencing, homepage_bottom_toast, item_list,
        //   mutual_relation_avatars, need_points, platform_sync_info, relative_users,
        //   search_highlight, type_label, user_tags, white_cover_url
    };
}