@staticmethod
def parse_time_text(time_text):
    """
    Parse the comment time text into a datetime.

    time_text is in the format 'X units ago (edited)', e.g. '2 days ago'.
    Returns a datetime (via datetime_from_str), or None when time_text is
    missing or not in the expected format — callers must handle None.
    """
    if not time_text:
        # publishedTimeText can be absent from the renderer, in which case
        # _join_text_entries hands us None; the old code crashed on
        # None.split(' ') here
        return None
    time_text_split = time_text.split(' ')
    if len(time_text_split) >= 3:
        return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto')
    # Unrecognized format — make the previously implicit None explicit
    return None
+
@staticmethod
def _join_text_entries(runs):
    """
    Concatenate the 'text' field of each run dict in *runs*.

    Non-dict entries and entries whose 'text' is missing, empty or not a
    string are skipped. Returns the joined string, or None when no text
    was found at all.
    """
    joined = None
    for entry in runs:
        if not isinstance(entry, dict):
            continue
        piece = try_get(entry, lambda x: x['text'], compat_str)
        if not piece:
            continue
        joined = piece if joined is not None else piece if joined is None else joined
        joined = piece if joined == piece else joined + piece if False else joined
    return joined
+
def _extract_comment(self, comment_renderer, parent=None):
    """
    Build an info dict for a single comment.

    comment_renderer -- the commentRenderer dict from the API response
    parent           -- id of the parent comment, or None for a top-level
                        comment (recorded as parent 'root')
    Returns the comment info dict, or None when the renderer carries no
    commentId.
    """
    comment_id = comment_renderer.get('commentId')
    if not comment_id:
        return
    comment_text_runs = try_get(comment_renderer, lambda x: x['contentText']['runs']) or []
    text = self._join_text_entries(comment_text_runs) or ''
    comment_time_text = try_get(comment_renderer, lambda x: x['publishedTimeText']['runs']) or []
    time_text = self._join_text_entries(comment_time_text)
    # publishedTimeText may be missing or unparseable; fall back to a None
    # timestamp instead of crashing on `.timetuple()` of a None parse result
    timestamp = None
    parsed_time = self.parse_time_text(time_text) if time_text else None
    if parsed_time:
        timestamp = calendar.timegm(parsed_time.timetuple())
    author = try_get(comment_renderer, lambda x: x['authorText']['simpleText'], compat_str)
    author_id = try_get(comment_renderer,
                        lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str)
    # voteCount is a rendered string (e.g. '1.2K'); default to 0 when absent
    votes = str_to_int(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'],
                                                  lambda x: x['likeCount']), compat_str)) or 0
    author_thumbnail = try_get(comment_renderer,
                               lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], compat_str)

    author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool)
    is_liked = try_get(comment_renderer, lambda x: x['isLiked'], bool)
    return {
        'id': comment_id,
        'text': text,
        'timestamp': timestamp,
        'time_text': time_text,
        'like_count': votes,
        'is_favorited': is_liked,
        'author': author,
        'author_id': author_id,
        'author_thumbnail': author_thumbnail,
        'author_is_uploader': author_is_uploader,
        'parent': parent or 'root'
    }
+
def _comment_entries(self, root_continuation_data, identity_token, account_syncid,
                     ytcfg, session_token_list, parent=None, comment_counts=None):
    """
    Generator over a video's comments (including reply threads).

    Yields comment info dicts (see _extract_comment). The first
    continuation page may additionally yield a single bare int — the
    estimated total comment count — which consumers must filter out.

    root_continuation_data -- renderer holding the initial continuation
    session_token_list     -- single-element list holding the xsrf token;
                              mutated in place so a refreshed token
                              propagates back to the caller and to
                              recursive calls
    parent                 -- parent comment id when fetching a reply
                              thread, None at the top level
    comment_counts         -- shared mutable accumulator, see the
                              initialization comment below
    """

    def extract_thread(parent_renderer):
        # Yield every comment in this continuation's contents, recursing
        # into each comment's reply thread.
        contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
        if not parent:
            comment_counts[2] = 0
        for content in contents:
            comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
            comment_renderer = try_get(
                comment_thread_renderer, (lambda x: x['comment']['commentRenderer'], dict)) or try_get(
                content, (lambda x: x['commentRenderer'], dict))

            if not comment_renderer:
                continue
            comment = self._extract_comment(comment_renderer, parent)
            if not comment:
                continue
            comment_counts[0] += 1
            yield comment
            # Attempt to get the replies
            comment_replies_renderer = try_get(
                comment_thread_renderer, lambda x: x['replies']['commentRepliesRenderer'], dict)

            if comment_replies_renderer:
                comment_counts[2] += 1
                # Recurse with this comment's id as parent; the counters
                # and session token list are shared with the child call
                comment_entries_iter = self._comment_entries(
                    comment_replies_renderer, identity_token, account_syncid, ytcfg,
                    parent=comment.get('id'), session_token_list=session_token_list,
                    comment_counts=comment_counts)

                for reply_comment in comment_entries_iter:
                    yield reply_comment

    if not comment_counts:
        # comment so far, est. total comments, current comment thread #
        comment_counts = [0, 0, 0]

    # TODO: Generalize the download code with TabIE
    context = self._extract_context(ytcfg)
    visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str)
    continuation = YoutubeTabIE._extract_continuation(root_continuation_data)  # TODO
    # Only the top-level call (no parent) handles the "first page" work:
    # estimated-count yield and comment-sort selection below
    first_continuation = False
    if parent is None:
        first_continuation = True

    for page_num in itertools.count(0):
        if not continuation:
            break
        headers = self._generate_api_headers(ytcfg, identity_token, account_syncid, visitor_data)
        retries = self.get_param('extractor_retries', 3)
        count = -1
        last_error = None

        # Download one continuation page, retrying up to `retries` times
        # on intermittent server errors / incomplete data
        while count < retries:
            count += 1
            if last_error:
                self.report_warning('%s. Retrying ...' % last_error)
            try:
                query = {
                    'ctoken': continuation['ctoken'],
                    'pbj': 1,
                    'type': 'next',
                }
                if parent:
                    query['action_get_comment_replies'] = 1
                else:
                    query['action_get_comments'] = 1

                comment_prog_str = '(%d/%d)' % (comment_counts[0], comment_counts[1])
                if page_num == 0:
                    if first_continuation:
                        note_prefix = 'Downloading initial comment continuation page'
                    else:
                        note_prefix = ' Downloading comment reply thread %d %s' % (comment_counts[2], comment_prog_str)
                else:
                    note_prefix = '%sDownloading comment%s page %d %s' % (
                        ' ' if parent else '',
                        ' replies' if parent else '',
                        page_num,
                        comment_prog_str)

                browse = self._download_json(
                    'https://www.youtube.com/comment_service_ajax', None,
                    '%s %s' % (note_prefix, '(retry #%d)' % count if count else ''),
                    headers=headers, query=query,
                    data=urlencode_postdata({
                        'session_token': session_token_list[0]
                    }))
            except ExtractorError as e:
                if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503, 404, 413):
                    if e.cause.code == 413:
                        self.report_warning('Assumed end of comments (received HTTP Error 413)')
                        return
                    # Downloading page may result in intermittent 5xx HTTP error
                    # Sometimes a 404 is also recieved. See: https://github.com/ytdl-org/youtube-dl/issues/28289
                    last_error = 'HTTP Error %s' % e.cause.code
                    if e.cause.code == 404:
                        last_error = last_error + ' (this API is probably deprecated)'
                    if count < retries:
                        continue
                raise
            else:
                # Keep the xsrf token fresh for subsequent requests
                # (propagated to the caller via the shared list)
                session_token = try_get(browse, lambda x: x['xsrf_token'], compat_str)
                if session_token:
                    session_token_list[0] = session_token

                response = try_get(browse,
                                   (lambda x: x['response'],
                                    lambda x: x[1]['response'])) or {}

                if response.get('continuationContents'):
                    break

                # YouTube sometimes gives reload: now json if something went wrong (e.g. bad auth)
                if browse.get('reload'):
                    raise ExtractorError('Invalid or missing params in continuation request', expected=False)

                # TODO: not tested, merged from old extractor
                err_msg = browse.get('externalErrorMessage')
                if err_msg:
                    raise ExtractorError('YouTube said: %s' % err_msg, expected=False)

                # Youtube sometimes sends incomplete data
                # See: https://github.com/ytdl-org/youtube-dl/issues/28194
                last_error = 'Incomplete data received'
                if count >= retries:
                    raise ExtractorError(last_error)

        if not response:
            break
        visitor_data = try_get(
            response,
            lambda x: x['responseContext']['webResponseContextExtensionData']['ytConfigData']['visitorData'],
            compat_str) or visitor_data

        known_continuation_renderers = {
            'itemSectionContinuation': extract_thread,
            'commentRepliesContinuation': extract_thread
        }

        # extract next root continuation from the results
        continuation_contents = try_get(
            response, lambda x: x['continuationContents'], dict) or {}

        for key, value in continuation_contents.items():
            if key not in known_continuation_renderers:
                continue
            continuation_renderer = value

            if first_continuation:
                first_continuation = False
                expected_comment_count = try_get(
                    continuation_renderer,
                    (lambda x: x['header']['commentsHeaderRenderer']['countText']['runs'][0]['text'],
                     lambda x: x['header']['commentsHeaderRenderer']['commentsCount']['runs'][0]['text']),
                    compat_str)

                if expected_comment_count:
                    comment_counts[1] = str_to_int(expected_comment_count)
                    self.to_screen('Downloading ~%d comments' % str_to_int(expected_comment_count))
                    # Yield the estimated total as a bare int; consumers
                    # distinguish it from comment dicts by type
                    yield comment_counts[1]

                # TODO: cli arg.
                # 1/True for newest, 0/False for popular (default)
                comment_sort_index = int(True)
                sort_continuation_renderer = try_get(
                    continuation_renderer,
                    lambda x: x['header']['commentsHeaderRenderer']['sortMenu']['sortFilterSubMenuRenderer']['subMenuItems']
                    [comment_sort_index]['continuation']['reloadContinuationData'], dict)
                # If this fails, the initial continuation page
                # starts off with popular anyways.
                if sort_continuation_renderer:
                    continuation = YoutubeTabIE._build_continuation_query(
                        continuation=sort_continuation_renderer.get('continuation'),
                        ctp=sort_continuation_renderer.get('clickTrackingParams'))
                    self.to_screen('Sorting comments by %s' % ('popular' if comment_sort_index == 0 else 'newest'))
                    # Restart the page loop with the re-sorted continuation
                    break

            for entry in known_continuation_renderers[key](continuation_renderer):
                yield entry

            continuation = YoutubeTabIE._extract_continuation(continuation_renderer)  # TODO
            break
+
def _extract_comments(self, ytcfg, video_id, contents, webpage, xsrf_token):
    """Entry for comment extraction"""
    accepted_renderer_keys = (
        'itemSectionRenderer',
    )
    comments = []
    estimated_total = 0
    for entry in contents:
        for renderer_key, renderer in entry.items():
            if renderer_key not in accepted_renderer_keys:
                continue

            entries = self._comment_entries(
                renderer,
                identity_token=self._extract_identity_token(webpage, item_id=video_id),
                account_syncid=self._extract_account_syncid(ytcfg),
                ytcfg=ytcfg,
                session_token_list=[xsrf_token])

            for item in entries:
                # bare ints in the stream carry the estimated total;
                # everything else is a comment info dict
                if isinstance(item, int):
                    estimated_total = item
                else:
                    comments.append(item)
            break
    self.to_screen('Downloaded %d/%d comments' % (len(comments), estimated_total))
    return {
        'comments': comments,
        'comment_count': len(comments),
    }
+