'x-twitter-active-user': 'yes',
})
- result, last_error = None, None
+ last_error = None
for bearer_token in self._TOKENS:
- headers['Authorization'] = f'Bearer {bearer_token}'
+ for first_attempt in (True, False):
+ headers['Authorization'] = f'Bearer {bearer_token}'
- if not self.is_logged_in:
- if not self._TOKENS[bearer_token]:
- headers.pop('x-guest-token', None)
- guest_token_response = self._download_json(
- self._API_BASE + 'guest/activate.json', video_id,
- 'Downloading guest token', data=b'', headers=headers)
-
- self._TOKENS[bearer_token] = guest_token_response.get('guest_token')
+ if not self.is_logged_in:
if not self._TOKENS[bearer_token]:
- raise ExtractorError('Could not retrieve guest token')
- headers['x-guest-token'] = self._TOKENS[bearer_token]
-
- try:
- allowed_status = {400, 403, 404} if graphql else {403}
- result = self._download_json(
- (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path,
- video_id, headers=headers, query=query, expected_status=allowed_status)
- break
-
- except ExtractorError as e:
- if last_error:
- raise last_error
- elif not isinstance(e.cause, urllib.error.HTTPError) or e.cause.code != 404:
- raise
- last_error = e
- self.report_warning(
- 'Twitter API gave 404 response, retrying with deprecated token. '
- 'Only one media item can be extracted')
-
- if result.get('errors'):
- error_message = ', '.join(set(traverse_obj(
- result, ('errors', ..., 'message'), expected_type=str))) or 'Unknown error'
- raise ExtractorError(f'Error(s) while querying api: {error_message}', expected=True)
-
- assert result is not None
- return result
+ headers.pop('x-guest-token', None)
+ guest_token_response = self._download_json(
+ self._API_BASE + 'guest/activate.json', video_id,
+ 'Downloading guest token', data=b'', headers=headers)
+
+ self._TOKENS[bearer_token] = guest_token_response.get('guest_token')
+ if not self._TOKENS[bearer_token]:
+ raise ExtractorError('Could not retrieve guest token')
+
+ headers['x-guest-token'] = self._TOKENS[bearer_token]
+
+ try:
+ allowed_status = {400, 403, 404} if graphql else {403}
+ result = self._download_json(
+ (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path,
+ video_id, headers=headers, query=query, expected_status=allowed_status)
+
+ except ExtractorError as e:
+ if last_error:
+ raise last_error
+
+ if not isinstance(e.cause, urllib.error.HTTPError) or e.cause.code != 404:
+ raise
+
+ last_error = e
+ self.report_warning(
+ 'Twitter API gave 404 response, retrying with deprecated auth token. '
+ 'Only one media item can be extracted')
+ break # continue outer loop with next bearer_token
+
+ if result.get('errors'):
+ errors = traverse_obj(result, ('errors', ..., 'message'), expected_type=str)
+ if first_attempt and any('bad guest token' in error.lower() for error in errors):
+ self.to_screen('Guest token has expired. Refreshing guest token')
+ self._TOKENS[bearer_token] = None
+ continue
+
+ error_message = ', '.join(set(errors)) or 'Unknown error'
+ raise ExtractorError(f'Error(s) while querying API: {error_message}', expected=True)
+
+ return result
# Placeholder for building the GraphQL query for `media_id`.
# This version does no work: it ignores `media_id` and always raises
# NotImplementedError, whose message states that the method must be
# implemented to support GraphQL.
# NOTE(review): presumably concrete extractors override this; no override or
# caller is visible in this excerpt, so the expected return shape is unconfirmed.
def _build_graphql_query(self, media_id):
raise NotImplementedError('Method must be implemented to support GraphQL')
class TwitterIE(TwitterBaseIE):
IE_NAME = 'twitter'
- _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P<id>\d+)'
+ _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P<id>\d+)(?:/video/(?P<index>\d+))?'
_TESTS = [{
'url': 'https://twitter.com/freethenipple/status/643211948184596480',
'id': '665052190608723968',
'display_id': '665052190608723968',
'ext': 'mp4',
- 'title': 'md5:3f57ab5d35116537a2ae7345cd0060d8',
+ 'title': 'md5:e99588f17b3dd0503814ffb560e64731',
'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ',
'uploader_id': 'starwars',
'uploader': r're:Star Wars.*',
# Test case of TwitterCardIE
'skip_download': True,
},
+ 'skip': 'Dead external link',
}, {
'url': 'https://twitter.com/jaydingeer/status/700207533655363584',
'info_dict': {
'id': '1577855447914409984',
'display_id': '1577855540407197696',
'ext': 'mp4',
- 'title': 'oshtru \U0001faac\U0001f47d - gm \u2728\ufe0f now I can post image and video. nice update.',
- 'description': 'gm \u2728\ufe0f now I can post image and video. nice update. https://t.co/cG7XgiINOm',
+ 'title': 'md5:9d198efb93557b8f8d5b78c480407214',
+ 'description': 'md5:b9c3699335447391d11753ab21c70a74',
'upload_date': '20221006',
- 'uploader': 'oshtru \U0001faac\U0001f47d',
+ 'uploader': 'oshtru',
'uploader_id': 'oshtru',
'uploader_url': 'https://twitter.com/oshtru',
'thumbnail': r're:^https?://.*\.jpg',
'uploader_url': 'https://twitter.com/Rizdraws',
'upload_date': '20220928',
'timestamp': 1664391723,
- 'thumbnail': 're:^https?://.*\\.jpg',
+ 'thumbnail': r're:^https?://.+\.jpg',
'like_count': int,
'repost_count': int,
'comment_count': int,
},
'add_ie': ['TwitterSpaces'],
'params': {'skip_download': 'm3u8'},
+ }, {
+ # URL specifies video number but --yes-playlist
+ 'url': 'https://twitter.com/CTVJLaidlaw/status/1600649710662213632/video/1',
+ 'playlist_mincount': 2,
+ 'info_dict': {
+ 'id': '1600649710662213632',
+ 'title': 'md5:be05989b0722e114103ed3851a0ffae2',
+ 'timestamp': 1670459604.0,
+ 'description': 'md5:591c19ce66fadc2359725d5cd0d1052c',
+ 'comment_count': int,
+ 'uploader_id': 'CTVJLaidlaw',
+ 'repost_count': int,
+ 'tags': ['colorectalcancer', 'cancerjourney', 'imnotaquitter'],
+ 'upload_date': '20221208',
+ 'age_limit': 0,
+ 'uploader': 'Jocelyn Laidlaw',
+ 'uploader_url': 'https://twitter.com/CTVJLaidlaw',
+ 'like_count': int,
+ },
+ }, {
+ # URL specifies video number and --no-playlist
+ 'url': 'https://twitter.com/CTVJLaidlaw/status/1600649710662213632/video/2',
+ 'info_dict': {
+ 'id': '1600649511827013632',
+ 'ext': 'mp4',
+ 'title': 'md5:be05989b0722e114103ed3851a0ffae2',
+ 'thumbnail': r're:^https?://.+\.jpg',
+ 'timestamp': 1670459604.0,
+ 'uploader_id': 'CTVJLaidlaw',
+ 'uploader': 'Jocelyn Laidlaw',
+ 'repost_count': int,
+ 'comment_count': int,
+ 'tags': ['colorectalcancer', 'cancerjourney', 'imnotaquitter'],
+ 'duration': 102.226,
+ 'uploader_url': 'https://twitter.com/CTVJLaidlaw',
+ 'display_id': '1600649710662213632',
+ 'like_count': int,
+ 'description': 'md5:591c19ce66fadc2359725d5cd0d1052c',
+ 'upload_date': '20221208',
+ 'age_limit': 0,
+ },
+ 'params': {'noplaylist': True},
}, {
# onion route
'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273',
}
def _real_extract(self, url):
- twid = self._match_id(url)
+ twid, selected_index = self._match_valid_url(url).group('id', 'index')
if self.is_logged_in or self._configuration_arg('force_graphql'):
self.write_debug(f'Using GraphQL API (Auth = {self.is_logged_in})')
result = self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid)
fmts, subs = self._extract_variant_formats(variant, twid)
subtitles = self._merge_subtitles(subtitles, subs)
formats.extend(fmts)
- self._sort_formats(formats, ('res', 'br', 'size', 'proto')) # The codec of http formats are unknown
thumbnails = []
media_url = media.get('media_url_https') or media.get('media_url')
'subtitles': subtitles,
'thumbnails': thumbnails,
'duration': float_or_none(video_info.get('duration_millis'), 1000),
+ # The codecs of HTTP formats are unknown
+ '_format_sort_fields': ('res', 'br', 'size', 'proto'),
}
def extract_from_card_info(card):
vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url')
content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player'))
formats, subtitles = self._extract_formats_from_vmap_url(vmap_url, content_id or twid)
- self._sort_formats(formats)
thumbnails = []
for suffix in ('_small', '', '_large', '_x_large', '_original'):
entries[0]['_old_archive_ids'] = [make_archive_id(self, twid)]
+ if not self._yes_playlist(twid, selected_index, video_label='URL-specified video number'):
+ index = int(selected_index) - 1
+ if index >= len(entries):
+ raise ExtractorError(f'Video #{selected_index} is unavailable', expected=True)
+
+ return entries[index]
+
if len(entries) == 1:
return entries[0]
class TwitterSpacesIE(TwitterBaseIE):
IE_NAME = 'twitter:spaces'
_VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/spaces/(?P<id>[0-9a-zA-Z]{13})'
- _TWITTER_GRAPHQL = 'https://twitter.com/i/api/graphql/HPEisOmj1epUNLCWTYhUWw/'
_TESTS = [{
'url': 'https://twitter.com/i/spaces/1RDxlgyvNXzJL',
# XXX: Native downloader does not work
formats = self._extract_m3u8_formats(
traverse_obj(source, 'noRedirectPlaybackUrl', 'location'),
- metadata['media_key'], 'm4a', 'm3u8', live=live_status == 'is_live')
+ metadata['media_key'], 'm4a', 'm3u8', live=live_status == 'is_live',
+ headers={'Referer': 'https://twitter.com/'})
for fmt in formats:
fmt.update({'vcodec': 'none', 'acodec': 'aac'})