return s
-# find the correct sorting and add the required base classes so that sublcasses
+# find the correct sorting and add the required base classes so that subclasses
# can be correctly created
classes = _ALL_CLASSES[:-1]
ordered_cls = []
- **ARD:mediathek**
- **ARDBetaMediathek**
- **Arkena**
- - **arte.tv:+7**
- - **arte.tv:embed**
- - **arte.tv:playlist**
+ - **ArteTV**
+ - **ArteTVEmbed**
+ - **ArteTVPlaylist**
- **AsianCrush**
- **AsianCrushPlaylist**
- **AtresPlayer**
- **la7.it**
- **laola1tv**
- **laola1tv:embed**
+ - **lbry.tv**
- **LCI**
- **Lcp**
- **LcpPlay**
- **SpankBangPlaylist**
- **Spankwire**
- **Spiegel**
- - **Spiegel:Article**: Articles on spiegel.de
- - **Spiegeltv**
- **sport.francetvinfo.fr**
- **Sport5**
- **SportBox**
- **YourPorn**
- **YourUpload**
- **youtube**: YouTube.com
- - **youtube:channel**: YouTube.com channels
- **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication)
- **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication)
- **youtube:live**: YouTube.com live streams
- **youtube:playlist**: YouTube.com playlists
- - **youtube:playlists**: YouTube.com user/channel playlists
- **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication)
- **youtube:search**: YouTube.com searches
- **youtube:search:date**: YouTube.com searches, newest videos first
- **youtube:search_url**: YouTube.com search URLs
- **youtube:show**: YouTube.com (multi-season) shows
- **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)
- - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword)
+ - **YoutubeYtUser**: YouTube.com user videos (URL or "ytuser" keyword)
+ - **youtube:tab**: YouTube.com tab
- **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)
- **Zapiks**
- **Zaq1**
def test_youtube_playlist_matching(self):
assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist'])
+ assertTab = lambda url: self.assertMatch(url, ['youtube:tab'])
assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') # 585
- assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
+ assertPlaylist('PL63F0C78739B09958')
+ assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
- assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
- assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668
+ assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
+ assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668
self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M'))
# Top tracks
- assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101')
+ assertTab('https://www.youtube.com/playlist?list=MCUS.20142101')
def test_youtube_matching(self):
self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M'))
self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube'])
def test_youtube_channel_matching(self):
- assertChannel = lambda url: self.assertMatch(url, ['youtube:channel'])
+ assertChannel = lambda url: self.assertMatch(url, ['youtube:tab'])
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM')
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec')
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')
- def test_youtube_user_matching(self):
- self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:user'])
+ # def test_youtube_user_matching(self):
+ # self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab'])
def test_youtube_feeds(self):
self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater'])
self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions'])
self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended'])
- self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites'])
- def test_youtube_show_matching(self):
- self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show'])
-
- def test_youtube_search_matching(self):
- self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
- self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
+ # def test_youtube_search_matching(self):
+ # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
+ # self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
def test_youtube_extract(self):
assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id)
self.assertEqual(d['x'], 1)
self.assertEqual(d['y'], 'a')
+ # Just drop the ! prefix for now, even though this results in a wrong value
+ on = js_to_json('''{
+ a: !0,
+ b: !1,
+ c: !!0,
+ d: !!42.42,
+ e: !!![],
+ f: !"abc",
+ g: !"",
+ !42: 42
+ }''')
+ self.assertEqual(json.loads(on), {
+ 'a': 0,
+ 'b': 1,
+ 'c': 0,
+ 'd': 42.42,
+ 'e': [],
+ 'f': "abc",
+ 'g': "",
+ '42': 42
+ })
+
on = js_to_json('["abc", "def",]')
self.assertEqual(json.loads(on), ['abc', 'def'])
on = js_to_json('{42:4.2e1}')
self.assertEqual(json.loads(on), {'42': 42.0})
+ on = js_to_json('{ "0x40": "0x40" }')
+ self.assertEqual(json.loads(on), {'0x40': '0x40'})
+
+ on = js_to_json('{ "040": "040" }')
+ self.assertEqual(json.loads(on), {'040': '040'})
+
def test_js_to_json_malformed(self):
self.assertEqual(js_to_json('42a1'), '42"a1"')
self.assertEqual(js_to_json('42a-1'), '42"a"-1')
video_element = video_xml.findall(compat_xpath('./track/video'))[-1]
if video_element is None or video_element.text is None:
raise ExtractorError(
- 'Video %s video does not exist' % video_id, expected=True)
+ 'Video %s does not exist' % video_id, expected=True)
video_url = video_element.text.strip()
import re
from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import (
+ compat_str,
+ compat_urlparse,
+)
from ..utils import (
ExtractorError,
int_or_none,
qualities,
try_get,
unified_strdate,
+ url_or_none,
)
-# There are different sources of video in arte.tv, the extraction process
-# is different for each one. The videos usually expire in 7 days, so we can't
-# add tests.
-
class ArteTVBaseIE(InfoExtractor):
- def _extract_from_json_url(self, json_url, video_id, lang, title=None):
- info = self._download_json(json_url, video_id)
+ _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
+ _API_BASE = 'https://api.arte.tv/api/player/v1'
+
+
+class ArteTVIE(ArteTVBaseIE):
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
+ api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
+ )
+ /(?P<id>\d{6}-\d{3}-[AF])
+ ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
+ _TESTS = [{
+ 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
+ 'info_dict': {
+ 'id': '088501-000-A',
+ 'ext': 'mp4',
+ 'title': 'Mexico: Stealing Petrol to Survive',
+ 'upload_date': '20190628',
+ },
+ }, {
+ 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ lang = mobj.group('lang') or mobj.group('lang_2')
+
+ info = self._download_json(
+ '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id)
player_info = info['videoJsonPlayer']
vsr = try_get(player_info, lambda x: x['VSR'], dict)
if not upload_date_str:
upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
- title = (player_info.get('VTI') or title or player_info['VID']).strip()
+ title = (player_info.get('VTI') or player_info['VID']).strip()
subtitle = player_info.get('VSU', '').strip()
if subtitle:
title += ' - %s' % subtitle
- info_dict = {
- 'id': player_info['VID'],
- 'title': title,
- 'description': player_info.get('VDE'),
- 'upload_date': unified_strdate(upload_date_str),
- 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
- }
qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])
LANGS = {
formats = []
for format_id, format_dict in vsr.items():
f = dict(format_dict)
+ format_url = url_or_none(f.get('url'))
+ streamer = f.get('streamer')
+ if not format_url and not streamer:
+ continue
versionCode = f.get('versionCode')
l = re.escape(langcode)
else:
lang_pref = -1
+ media_type = f.get('mediaType')
+ if media_type == 'hls':
+ m3u8_formats = self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id=format_id, fatal=False)
+ for m3u8_format in m3u8_formats:
+ m3u8_format['language_preference'] = lang_pref
+ formats.extend(m3u8_formats)
+ continue
+
format = {
'format_id': format_id,
'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
'quality': qfunc(f.get('quality')),
}
- if f.get('mediaType') == 'rtmp':
+ if media_type == 'rtmp':
format['url'] = f['streamer']
format['play_path'] = 'mp4:' + f['url']
format['ext'] = 'flv'
formats.append(format)
- self._check_formats(formats, video_id)
self._sort_formats(formats)
- info_dict['formats'] = formats
- return info_dict
-
+ return {
+ 'id': player_info.get('VID') or video_id,
+ 'title': title,
+ 'description': player_info.get('VDE'),
+ 'upload_date': unified_strdate(upload_date_str),
+ 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
+ 'formats': formats,
+ }
-class ArteTVPlus7IE(ArteTVBaseIE):
- IE_NAME = 'arte.tv:+7'
- _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>\d{6}-\d{3}-[AF])'
+class ArteTVEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
_TESTS = [{
- 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
+ 'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
'info_dict': {
- 'id': '088501-000-A',
+ 'id': '100605-013-A',
'ext': 'mp4',
- 'title': 'Mexico: Stealing Petrol to Survive',
- 'upload_date': '20190628',
+ 'title': 'United we Stream November Lockdown Edition #13',
+ 'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
+ 'upload_date': '20201116',
},
+ }, {
+ 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
+ 'only_matching': True,
}]
- def _real_extract(self, url):
- lang, video_id = re.match(self._VALID_URL, url).groups()
- return self._extract_from_json_url(
- 'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id),
- video_id, lang)
-
-
-class ArteTVEmbedIE(ArteTVPlus7IE):
- IE_NAME = 'arte.tv:embed'
- _VALID_URL = r'''(?x)
- https://www\.arte\.tv
- /player/v3/index\.php\?json_url=
- (?P<json_url>
- https?://api\.arte\.tv/api/player/v1/config/
- (?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF])
- )
- '''
-
- _TESTS = []
+ @staticmethod
+ def _extract_urls(webpage):
+ return [url for _, url in re.findall(
+ r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
+ webpage)]
def _real_extract(self, url):
- json_url, lang, video_id = re.match(self._VALID_URL, url).groups()
- return self._extract_from_json_url(json_url, video_id, lang)
+ qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ json_url = qs['json_url'][0]
+ video_id = ArteTVIE._match_id(json_url)
+ return self.url_result(
+ json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
class ArteTVPlaylistIE(ArteTVBaseIE):
- IE_NAME = 'arte.tv:playlist'
- _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})'
-
+ _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
'info_dict': {
'description': 'md5:d322c55011514b3a7241f7fb80d494c2',
},
'playlist_mincount': 6,
+ }, {
+ 'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
lang, playlist_id = re.match(self._VALID_URL, url).groups()
collection = self._download_json(
- 'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos'
- % (lang, playlist_id), playlist_id)
+ '%s/collectionData/%s/%s?source=videos'
+ % (self._API_BASE, lang, playlist_id), playlist_id)
+ entries = []
+ for video in collection['videos']:
+ if not isinstance(video, dict):
+ continue
+ video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl'))
+ if not video_url:
+ continue
+ video_id = video.get('programId')
+ entries.append({
+ '_type': 'url_transparent',
+ 'url': video_url,
+ 'id': video_id,
+ 'title': video.get('title'),
+ 'alt_title': video.get('subtitle'),
+ 'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)),
+ 'duration': int_or_none(video.get('durationSeconds')),
+ 'view_count': int_or_none(video.get('views')),
+ 'ie_key': ArteTVIE.ie_key(),
+ })
title = collection.get('title')
description = collection.get('shortDescription') or collection.get('teaserText')
- entries = [
- self._extract_from_json_url(
- video['jsonUrl'], video.get('programId') or playlist_id, lang)
- for video in collection['videos'] if video.get('jsonUrl')]
return self.playlist_result(entries, playlist_id, title, description)
+# coding: utf-8
from __future__ import unicode_literals
import random
import time
from .common import InfoExtractor
-from ..compat import (
- compat_str,
- compat_urlparse,
-)
+from ..compat import compat_str
from ..utils import (
ExtractorError,
float_or_none,
parse_filesize,
str_or_none,
try_get,
- unescapeHTML,
update_url_query,
unified_strdate,
unified_timestamp,
url_or_none,
+ urljoin,
)
-class BandcampBaseIE(InfoExtractor):
- """Provide base functions for Bandcamp extractors"""
-
- def _extract_json_from_html_data_attribute(self, webpage, suffix, video_id):
- json_string = self._html_search_regex(
- r' data-%s="([^"]*)' % suffix,
- webpage, '%s json' % suffix, default='{}')
-
- return self._parse_json(json_string, video_id)
-
- def _parse_json_track(self, json):
- formats = []
- file_ = json.get('file')
- if isinstance(file_, dict):
- for format_id, format_url in file_.items():
- if not url_or_none(format_url):
- continue
- ext, abr_str = format_id.split('-', 1)
- formats.append({
- 'format_id': format_id,
- 'url': self._proto_relative_url(format_url, 'http:'),
- 'ext': ext,
- 'vcodec': 'none',
- 'acodec': ext,
- 'abr': int_or_none(abr_str),
- })
-
- return {
- 'duration': float_or_none(json.get('duration')),
- 'id': str_or_none(json.get('track_id') or json.get('id')),
- 'title': json.get('title'),
- 'title_link': json.get('title_link'),
- 'number': int_or_none(json.get('track_num')),
- 'formats': formats
- }
-
-
-class BandcampIE(BandcampBaseIE):
- IE_NAME = "Bandcamp:track"
- _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
+class BandcampIE(InfoExtractor):
+ _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
'md5': 'c557841d5e50261777a6585648adf439',
'info_dict': {
'id': '1812978515',
'ext': 'mp3',
- 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
+ 'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭",
'duration': 9.8485,
- 'uploader': "youtube-dl \"'/\\\u00e4\u21ad",
- 'timestamp': 1354224127,
+ 'uploader': 'youtube-dl "\'/\\ä↭',
'upload_date': '20121129',
+ 'timestamp': 1354224127,
},
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}, {
# free download
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
- 'md5': '5d92af55811e47f38962a54c30b07ef0',
'info_dict': {
'id': '2650410135',
'ext': 'aiff',
},
}]
+ def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True):
+ return self._parse_json(self._html_search_regex(
+ r'data-%s=(["\'])({.+?})\1' % attr, webpage,
+ attr + ' data', group=2), video_id, fatal=fatal)
+
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- title = mobj.group('title')
- url_track_title = title
+ title = self._match_id(url)
webpage = self._download_webpage(url, title)
- thumbnail = self._html_search_meta('og:image', webpage, default=None)
-
- json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", url_track_title)
- json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", url_track_title)
-
- json_tracks = json_tralbum.get('trackinfo')
- if not json_tracks:
- raise ExtractorError('Could not extract track')
-
- track = self._parse_json_track(json_tracks[0])
- artist = json_tralbum.get('artist')
- album_title = json_embed.get('album_title')
-
- json_album = json_tralbum.get('packages')
- if json_album:
- json_album = json_album[0]
- album_publish_date = json_album.get('album_publish_date')
- album_release_date = json_album.get('album_release_date')
- else:
- album_publish_date = None
- album_release_date = json_tralbum.get('album_release_date')
-
- timestamp = unified_timestamp(json_tralbum.get('current', {}).get('publish_date') or album_publish_date)
- release_date = unified_strdate(album_release_date)
-
- download_link = self._search_regex(
- r'freeDownloadPage(?:["\']|"):\s*(["\']|")(?P<url>(?:(?!\1).)+)\1', webpage,
- 'download link', default=None, group='url')
+ tralbum = self._extract_data_attr(webpage, title)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ track_id = None
+ track = None
+ track_number = None
+ duration = None
+
+ formats = []
+ track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict)
+ if track_info:
+ file_ = track_info.get('file')
+ if isinstance(file_, dict):
+ for format_id, format_url in file_.items():
+ if not url_or_none(format_url):
+ continue
+ ext, abr_str = format_id.split('-', 1)
+ formats.append({
+ 'format_id': format_id,
+ 'url': self._proto_relative_url(format_url, 'http:'),
+ 'ext': ext,
+ 'vcodec': 'none',
+ 'acodec': ext,
+ 'abr': int_or_none(abr_str),
+ })
+ track = track_info.get('title')
+ track_id = str_or_none(
+ track_info.get('track_id') or track_info.get('id'))
+ track_number = int_or_none(track_info.get('track_num'))
+ duration = float_or_none(track_info.get('duration'))
+
+ embed = self._extract_data_attr(webpage, title, 'embed', False)
+ current = tralbum.get('current') or {}
+ artist = embed.get('artist') or current.get('artist') or tralbum.get('artist')
+ timestamp = unified_timestamp(
+ current.get('publish_date') or tralbum.get('album_publish_date'))
+
+ download_link = tralbum.get('freeDownloadPage')
if download_link:
- track_id = self._search_regex(
- r'\?id=(?P<id>\d+)&',
- download_link, 'track id')
+ track_id = compat_str(tralbum['id'])
download_webpage = self._download_webpage(
download_link, track_id, 'Downloading free downloads page')
- blob = self._parse_json(
- self._search_regex(
- r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
- 'blob', group='blob'),
- track_id, transform_source=unescapeHTML)
+ blob = self._extract_data_attr(download_webpage, track_id, 'blob')
info = try_get(
blob, (lambda x: x['digital_items'][0],
if info:
downloads = info.get('downloads')
if isinstance(downloads, dict):
+ if not track:
+ track = info.get('title')
if not artist:
artist = info.get('artist')
if not thumbnail:
retry_url = url_or_none(stat.get('retry_url'))
if not retry_url:
continue
- track['formats'].append({
+ formats.append({
'url': self._proto_relative_url(retry_url, 'http:'),
'ext': download_formats.get(format_id),
'format_id': format_id,
'vcodec': 'none',
})
- self._sort_formats(track['formats'])
+ self._sort_formats(formats)
- title = '%s - %s' % (artist, track.get('title')) if artist else track.get('title')
+ title = '%s - %s' % (artist, track) if artist else track
+
+ if not duration:
+ duration = float_or_none(self._html_search_meta(
+ 'duration', webpage, default=None))
return {
- 'album': album_title,
- 'artist': artist,
- 'duration': track['duration'],
- 'formats': track['formats'],
- 'id': track['id'],
- 'release_date': release_date,
+ 'id': track_id,
+ 'title': title,
'thumbnail': thumbnail,
+ 'uploader': artist,
'timestamp': timestamp,
- 'title': title,
- 'track': track['title'],
- 'track_id': track['id'],
- 'track_number': track['number'],
- 'uploader': artist
+ 'release_date': unified_strdate(tralbum.get('album_release_date')),
+ 'duration': duration,
+ 'track': track,
+ 'track_number': track_number,
+ 'track_id': track_id,
+ 'artist': artist,
+ 'album': embed.get('album_title'),
+ 'formats': formats,
}
-class BandcampAlbumIE(BandcampBaseIE):
+class BandcampAlbumIE(BandcampIE):
IE_NAME = 'Bandcamp:album'
- _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
+ _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?'
_TESTS = [{
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
'info_dict': {
'id': '1353101989',
'ext': 'mp3',
- 'title': 'Intro',
+ 'title': 'Blazo - Intro',
+ 'timestamp': 1311756226,
+ 'upload_date': '20110727',
+ 'uploader': 'Blazo',
}
},
{
'info_dict': {
'id': '38097443',
'ext': 'mp3',
- 'title': 'Kero One - Keep It Alive (Blazo remix)',
+ 'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)',
+ 'timestamp': 1311757238,
+ 'upload_date': '20110727',
+ 'uploader': 'Blazo',
}
},
],
'title': '"Entropy" EP',
'uploader_id': 'jstrecords',
'id': 'entropy-ep',
+ 'description': 'md5:0ff22959c943622972596062f2f366a5',
},
'playlist_mincount': 3,
}, {
'id': 'we-are-the-plague',
'title': 'WE ARE THE PLAGUE',
'uploader_id': 'insulters',
+ 'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f',
},
'playlist_count': 2,
}]
else super(BandcampAlbumIE, cls).suitable(url))
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- uploader_id = mobj.group('subdomain')
- album_id = mobj.group('album_id')
+ uploader_id, album_id = re.match(self._VALID_URL, url).groups()
playlist_id = album_id or uploader_id
webpage = self._download_webpage(url, playlist_id)
-
- json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", playlist_id)
- json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", playlist_id)
-
- json_tracks = json_tralbum.get('trackinfo')
- if not json_tracks:
- raise ExtractorError('Could not extract album tracks')
-
- album_title = json_embed.get('album_title')
-
+ tralbum = self._extract_data_attr(webpage, playlist_id)
+ track_info = tralbum.get('trackinfo')
+ if not track_info:
+ raise ExtractorError('The page doesn\'t contain any tracks')
# Only tracks with duration info have songs
- tracks = [self._parse_json_track(track) for track in json_tracks]
entries = [
self.url_result(
- compat_urlparse.urljoin(url, track['title_link']),
- ie=BandcampIE.ie_key(), video_id=track['id'],
- video_title=track['title'])
- for track in tracks
- if track.get('duration')]
+ urljoin(url, t['title_link']), BandcampIE.ie_key(),
+ str_or_none(t.get('track_id') or t.get('id')), t.get('title'))
+ for t in track_info
+ if t.get('duration')]
+
+ current = tralbum.get('current') or {}
return {
'_type': 'playlist',
'uploader_id': uploader_id,
'id': playlist_id,
- 'title': album_title,
- 'entries': entries
+ 'title': current.get('title'),
+ 'description': current.get('about'),
+ 'entries': entries,
}
-class BandcampWeeklyIE(InfoExtractor):
+class BandcampWeeklyIE(BandcampIE):
IE_NAME = 'Bandcamp:weekly'
_VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
_TESTS = [{
'release_date': '20170404',
'series': 'Bandcamp Weekly',
'episode': 'Magic Moments',
- 'episode_number': 208,
'episode_id': '224',
- }
+ },
+ 'params': {
+ 'format': 'opus-lo',
+ },
}, {
'url': 'https://bandcamp.com/?blah/blah@&show=228',
'only_matching': True
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- blob = self._parse_json(
- self._search_regex(
- r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage,
- 'blob', group='blob'),
- video_id, transform_source=unescapeHTML)
+ show_id = self._match_id(url)
+ webpage = self._download_webpage(url, show_id)
- show = blob['bcw_show']
+ blob = self._extract_data_attr(webpage, show_id, 'blob')
- # This is desired because any invalid show id redirects to `bandcamp.com`
- # which happens to expose the latest Bandcamp Weekly episode.
- show_id = int_or_none(show.get('show_id')) or int_or_none(video_id)
+ show = blob['bcw_data'][show_id]
formats = []
for format_id, format_url in show['audio_stream'].items():
if subtitle:
title += ' - %s' % subtitle
- episode_number = None
- seq = blob.get('bcw_seq')
-
- if seq and isinstance(seq, list):
- try:
- episode_number = next(
- int_or_none(e.get('episode_number'))
- for e in seq
- if isinstance(e, dict) and int_or_none(e.get('id')) == show_id)
- except StopIteration:
- pass
-
return {
- 'id': video_id,
+ 'id': show_id,
'title': title,
'description': show.get('desc') or show.get('short_desc'),
'duration': float_or_none(show.get('audio_duration')),
'release_date': unified_strdate(show.get('published_date')),
'series': 'Bandcamp Weekly',
'episode': show.get('subtitle'),
- 'episode_number': episode_number,
- 'episode_id': compat_str(video_id),
+ 'episode_id': show_id,
'formats': formats
}
# coding: utf-8
from __future__ import unicode_literals
+import re
from .common import InfoExtractor
from ..utils import smuggle_url
class CNBCVideoIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)'
_TEST = {
'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html',
'info_dict': {
}
def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
- video_id = self._search_regex(
- r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id,
- 'video id')
+ path, display_id = re.match(self._VALID_URL, url).groups()
+ video_id = self._download_json(
+ 'https://webql-redesign.cnbcfm.com/graphql', display_id, query={
+ 'query': '''{
+ page(path: "%s") {
+ vcpsId
+ }
+}''' % path,
+ })['data']['page']['vcpsId']
return self.url_result(
- 'http://video.cnbc.com/gallery/?video=%s' % video_id,
+ 'http://video.cnbc.com/gallery/?video=%d' % video_id,
CNBCIE.ie_key())
try:
self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
return True
- except ExtractorError:
+ except ExtractorError as e:
self.to_screen(
- '%s: %s URL is invalid, skipping' % (video_id, item))
+ '%s: %s URL is invalid, skipping: %s'
+ % (video_id, item, error_to_compat_str(e.cause)))
return False
def http_scheme(self):
mimetype2ext,
orderedSet,
parse_iso8601,
+ strip_or_none,
+ try_get,
)
'uploader': 'gq',
'upload_date': '20170321',
'timestamp': 1490126427,
+ 'description': 'How much grimmer would things be if these people were competent?',
},
}, {
# JS embed
'title': '3D printed TSA Travel Sentry keys really do open TSA locks',
'uploader': 'arstechnica',
'upload_date': '20150916',
- 'timestamp': 1442434955,
+ 'timestamp': 1442434920,
}
}, {
'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player',
})
self._sort_formats(formats)
+ subtitles = {}
+ for t, caption in video_info.get('captions', {}).items():
+ caption_url = caption.get('src')
+ if not (t in ('vtt', 'srt', 'tml') and caption_url):
+ continue
+ subtitles.setdefault('en', []).append({'url': caption_url})
+
return {
'id': video_id,
'formats': formats,
'season': video_info.get('season_title'),
'timestamp': parse_iso8601(video_info.get('premiere_date')),
'categories': video_info.get('categories'),
+ 'subtitles': subtitles,
}
def _real_extract(self, url):
if url_type == 'series':
return self._extract_series(url, webpage)
else:
- params = self._extract_video_params(webpage, display_id)
- info = self._search_json_ld(
- webpage, display_id, fatal=False)
+ video = try_get(self._parse_json(self._search_regex(
+ r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
+ 'preload state', '{}'), display_id),
+ lambda x: x['transformed']['video'])
+ if video:
+ params = {'videoId': video['id']}
+ info = {'description': strip_or_none(video.get('description'))}
+ else:
+ params = self._extract_video_params(webpage, display_id)
+ info = self._search_json_ld(
+ webpage, display_id, fatal=False)
info.update(self._extract_video(params))
return info
ARDMediathekIE,
)
from .arte import (
- ArteTVPlus7IE,
+ ArteTVIE,
ArteTVEmbedIE,
ArteTVPlaylistIE,
)
EHFTVIE,
ITTFIE,
)
+from .lbry import LBRYIE
from .lci import LCIIE
from .lcp import (
LcpPlayIE,
SpankBangPlaylistIE,
)
from .spankwire import SpankwireIE
-from .spiegel import SpiegelIE, SpiegelArticleIE
-from .spiegeltv import SpiegeltvIE
+from .spiegel import SpiegelIE
from .spike import (
BellatorIE,
ParamountNetworkIE,
from .yourupload import YourUploadIE
from .youtube import (
YoutubeIE,
- YoutubeChannelIE,
YoutubeFavouritesIE,
YoutubeHistoryIE,
YoutubeLiveIE,
+ YoutubeTabIE,
YoutubePlaylistIE,
- YoutubePlaylistsIE,
YoutubeRecommendedIE,
YoutubeSearchDateIE,
YoutubeSearchIE,
YoutubeSubscriptionsIE,
YoutubeTruncatedIDIE,
YoutubeTruncatedURLIE,
- YoutubeUserIE,
+ YoutubeYtUserIE,
YoutubeWatchLaterIE,
)
from .zapiks import ZapiksIE
parse_duration,
try_get,
url_or_none,
+ urljoin,
)
from .dailymotion import DailymotionIE
is_live = None
- formats = []
- for video in info['videos']:
- if video['statut'] != 'ONLINE':
+ videos = []
+
+ for video in (info.get('videos') or []):
+ if video.get('statut') != 'ONLINE':
continue
- video_url = video['url']
+ if not video.get('url'):
+ continue
+ videos.append(video)
+
+ if not videos:
+ for device_type in ['desktop', 'mobile']:
+ fallback_info = self._download_json(
+ 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id,
+ video_id, 'Downloading fallback %s video JSON' % device_type, query={
+ 'device_type': device_type,
+ 'browser': 'chrome',
+ }, fatal=False)
+
+ if fallback_info and fallback_info.get('video'):
+ videos.append(fallback_info['video'])
+
+ formats = []
+ for video in videos:
+ video_url = video.get('url')
if not video_url:
continue
if is_live is None:
is_live = (try_get(
- video, lambda x: x['plages_ouverture'][0]['direct'],
- bool) is True) or '/live.francetv.fr/' in video_url
- format_id = video['format']
+ video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True
+ or video.get('is_live') is True
+ or '/live.francetv.fr/' in video_url)
+ format_id = video.get('format')
ext = determine_ext(video_url)
if ext == 'f4m':
if georestricted:
sign(video_url, format_id), video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id=format_id,
fatal=False))
+ elif ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False))
elif video_url.startswith('rtmp'):
formats.append({
'url': video_url,
'url': video_url,
'format_id': format_id,
})
+
self._sort_formats(formats)
title = info['titre']
return {
'id': video_id,
'title': self._live_title(title) if is_live else title,
- 'description': clean_html(info['synopsis']),
- 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']),
- 'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']),
- 'timestamp': int_or_none(info['diffusion']['timestamp']),
+ 'description': clean_html(info.get('synopsis')),
+ 'thumbnail': urljoin('http://pluzz.francetv.fr', info.get('image')),
+ 'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')),
+ 'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])),
'is_live': is_live,
'formats': formats,
'subtitles': subtitles,
from .videa import VideaIE
from .twentymin import TwentyMinutenIE
from .ustream import UstreamIE
+from .arte import ArteTVEmbedIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
from .limelight import LimelightBaseIE
return self.url_result(ustream_url, UstreamIE.ie_key())
# Look for embedded arte.tv player
- mobj = re.search(
- r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"',
- webpage)
- if mobj is not None:
- return self.url_result(mobj.group('url'), 'ArteTVEmbed')
+ arte_urls = ArteTVEmbedIE._extract_urls(webpage)
+ if arte_urls:
+ return self.playlist_from_matches(arte_urls, video_id, video_title)
# Look for embedded francetv player
mobj = re.search(
elif function in other_functions:
other_functions[function]()
else:
- raise ExtractorError('Unknown funcion %s' % function)
+ raise ExtractorError('Unknown function %s' % function)
return sdk.target
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ determine_ext,
+ ExtractorError,
+ int_or_none,
+ mimetype2ext,
+ try_get,
+)
+
+
+class LBRYIE(InfoExtractor):
+    # Extracts a single video or audio stream from lbry.tv / odysee.com.
+    # The claim is resolved through the lbry.tv JSON-RPC proxy and the
+    # streaming URL is requested from the same endpoint.
+    IE_NAME = 'lbry.tv'
+    # The id group has the form "<@channel>:<claim id>/<stream name>:<claim id>".
+    # Both claim id parts accept one or more characters so that a longer
+    # claim id prefix on the stream is not truncated to its first character.
+    _VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P<id>@[0-9a-zA-Z-]+:[0-9a-z]+/[0-9a-zA-Z().-]+:[0-9a-z]+)'
+    _TESTS = [{
+        # Video
+        'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1',
+        'md5': '65bd7ec1f6744ada55da8e4c48a2edf9',
+        'info_dict': {
+            'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d',
+            'ext': 'mp4',
+            'title': 'First day in LBRY? Start HERE!',
+            'description': 'md5:f6cb5c704b332d37f5119313c2c98f51',
+            'timestamp': 1595694354,
+            'upload_date': '20200725',
+        }
+    }, {
+        # Audio
+        'url': 'https://lbry.tv/@LBRYFoundation:0/Episode-1:e',
+        'md5': 'c94017d3eba9b49ce085a8fad6b98d00',
+        'info_dict': {
+            'id': 'e7d93d772bd87e2b62d5ab993c1c3ced86ebb396',
+            'ext': 'mp3',
+            'title': 'The LBRY Foundation Community Podcast Episode 1 - Introduction, Streaming on LBRY, Transcoding',
+            'description': 'md5:661ac4f1db09f31728931d7b88807a61',
+            'timestamp': 1591312601,
+            'upload_date': '20200604',
+        }
+    }, {
+        'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e',
+        'only_matching': True,
+    }]
+
+    def _call_api_proxy(self, method, display_id, params):
+        # POST a JSON-RPC style request ({'method': ..., 'params': ...}) to
+        # the proxy and return the 'result' member of the JSON response.
+        return self._download_json(
+            'https://api.lbry.tv/api/v1/proxy', display_id,
+            headers={'Content-Type': 'application/json-rpc'},
+            data=json.dumps({
+                'method': method,
+                'params': params,
+            }).encode())['result']
+
+    def _real_extract(self, url):
+        # URLs separate name and claim id with ':' while lbry:// URIs use '#'.
+        display_id = self._match_id(url).replace(':', '#')
+        uri = 'lbry://' + display_id
+        # 'resolve' returns a mapping keyed by the requested URI.
+        result = self._call_api_proxy(
+            'resolve', display_id, {'urls': [uri]})[uri]
+        result_value = result['value']
+        # Only video and audio streams are handled by this extractor.
+        if result_value.get('stream_type') not in ('video', 'audio'):
+            raise ExtractorError('Unsupported URL', expected=True)
+        streaming_url = self._call_api_proxy(
+            'get', display_id, {'uri': uri})['streaming_url']
+        source = result_value.get('source') or {}
+        media = result_value.get('video') or result_value.get('audio') or {}
+        signing_channel = result_value.get('signing_channel') or {}
+
+        return {
+            'id': result['claim_id'],
+            'title': result_value['title'],
+            'thumbnail': try_get(result_value, lambda x: x['thumbnail']['url'], compat_str),
+            'description': result_value.get('description'),
+            'license': result_value.get('license'),
+            'timestamp': int_or_none(result.get('timestamp')),
+            'tags': result_value.get('tags'),
+            'width': int_or_none(media.get('width')),
+            'height': int_or_none(media.get('height')),
+            'duration': int_or_none(media.get('duration')),
+            'channel': signing_channel.get('name'),
+            'channel_id': signing_channel.get('claim_id'),
+            # Pass None as the default extension: determine_ext otherwise
+            # returns a truthy placeholder and the media type fallback would
+            # never be consulted.
+            'ext': determine_ext(source.get('name'), None) or mimetype2ext(source.get('media_type')),
+            'filesize': int_or_none(source.get('size')),
+            'url': streaming_url,
+        }
from .common import InfoExtractor
from ..utils import (
- determine_ext,
- int_or_none,
- parse_duration,
- remove_end,
+ clean_html,
+ merge_dicts,
)
class LRTIE(InfoExtractor):
IE_NAME = 'lrt.lt'
- _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?lrt\.lt(?P<path>/mediateka/irasas/(?P<id>[0-9]+))'
_TESTS = [{
# m3u8 download
- 'url': 'http://www.lrt.lt/mediateka/irasas/54391/',
- 'md5': 'fe44cf7e4ab3198055f2c598fc175cb0',
+ 'url': 'https://www.lrt.lt/mediateka/irasas/2000127261/greita-ir-gardu-sicilijos-ikvepta-klasikiniu-makaronu-su-baklazanais-vakariene',
+ 'md5': '85cb2bb530f31d91a9c65b479516ade4',
'info_dict': {
- 'id': '54391',
+ 'id': '2000127261',
'ext': 'mp4',
- 'title': 'Septynios Kauno dienos',
- 'description': 'md5:24d84534c7dc76581e59f5689462411a',
- 'duration': 1783,
- 'view_count': int,
- 'like_count': int,
+ 'title': 'Greita ir gardu: Sicilijos įkvėpta klasikinių makaronų su baklažanais vakarienė',
+ 'description': 'md5:ad7d985f51b0dc1489ba2d76d7ed47fa',
+ 'duration': 3035,
+ 'timestamp': 1604079000,
+ 'upload_date': '20201030',
},
}, {
# direct mp3 download
},
}]
+    def _extract_js_var(self, webpage, var_name, default):
+        # Find a quoted string assigned to the JavaScript variable var_name
+        # in webpage; default is passed through to _search_regex.
+        js_assignment_re = r'%s\s*=\s*(["\'])((?:(?!\1).)+)\1' % var_name
+        human_name = var_name.replace('_', ' ')
+        return self._search_regex(
+            js_assignment_re, webpage, human_name, default, group=2)
+
def _real_extract(self, url):
+ # Build the result from three sources: the media info JSON (parsed as
+ # JWPlayer data), the page's JSON-LD and a few cleaned up extra fields.
- video_id = self._match_id(url)
+ path, video_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, video_id)
- title = remove_end(self._og_search_title(webpage), ' - LRT')
-
- formats = []
- for _, file_url in re.findall(
- r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage):
- ext = determine_ext(file_url)
- if ext not in ('m3u8', 'mp3'):
- continue
- # mp3 served as m3u8 produces stuttered media file
- if ext == 'm3u8' and '.mp3' in file_url:
- continue
- if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- file_url, video_id, 'mp4', entry_protocol='m3u8_native',
- fatal=False))
- elif ext == 'mp3':
- formats.append({
- 'url': file_url,
- 'vcodec': 'none',
- })
- self._sort_formats(formats)
+ # Use the page's main_url variable, falling back to the URL path.
+ media_url = self._extract_js_var(webpage, 'main_url', path)
+ # The media info endpoint is also read from the page, with a
+ # hard-coded fallback when the variable is not found.
+ media = self._download_json(self._extract_js_var(
+ webpage, 'media_info_url',
+ 'https://www.lrt.lt/servisai/stream_url/vod/media_info/'),
+ video_id, query={'url': media_url})
+ jw_data = self._parse_jwplayer_data(
+ media['playlist_item'], video_id, base_url=url)
- thumbnail = self._og_search_thumbnail(webpage)
- description = self._og_search_description(webpage)
- duration = parse_duration(self._search_regex(
- r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1',
- webpage, 'duration', default=None, group='duration'))
+ json_ld_data = self._search_json_ld(webpage, video_id)
- view_count = int_or_none(self._html_search_regex(
- r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>',
- webpage, 'view count', fatal=False, group='count'))
- like_count = int_or_none(self._search_regex(
- r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<',
- webpage, 'like count', fatal=False, group='count'))
+ # Collect the non-empty tag names from the media info.
+ tags = []
+ for tag in (media.get('tags') or []):
+ tag_name = tag.get('name')
+ if not tag_name:
+ continue
+ tags.append(tag_name)
- return {
- 'id': video_id,
- 'title': title,
- 'formats': formats,
- 'thumbnail': thumbnail,
- 'description': description,
- 'duration': duration,
- 'view_count': view_count,
- 'like_count': like_count,
+ clean_info = {
+ 'description': clean_html(media.get('content')),
+ 'tags': tags,
}
+
+ return merge_dicts(clean_info, jw_data, json_ld_data)
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import merge_dicts
+from ..utils import (
+ clean_html,
+ dict_get,
+ float_or_none,
+ int_or_none,
+ merge_dicts,
+ parse_duration,
+ try_get,
+)
class MallTVIE(InfoExtractor):
'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
'ext': 'mp4',
'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?',
- 'description': 'md5:25fc0ec42a72ba602b602c683fa29deb',
+ 'description': 'md5:db7d5744a4bd4043d9d98324aa72ab35',
'duration': 216,
'timestamp': 1538870400,
'upload_date': '20181007',
webpage = self._download_webpage(
url, display_id, headers=self.geo_verification_headers())
- SOURCE_RE = r'(<source[^>]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P<id>[\da-z]+)/index)\b'
+ video = self._parse_json(self._search_regex(
+ r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);',
+ webpage, 'video object'), display_id)
+ video_source = video['VideoSource']
video_id = self._search_regex(
- SOURCE_RE, webpage, 'video id', group='id')
+ r'/([\da-z]+)/index\b', video_source, 'video id')
+
+ formats = self._extract_m3u8_formats(
+ video_source + '.m3u8', video_id, 'mp4', 'm3u8_native')
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for s in (video.get('Subtitles') or {}):
+ s_url = s.get('Url')
+ if not s_url:
+ continue
+ subtitles.setdefault(s.get('Language') or 'cz', []).append({
+ 'url': s_url,
+ })
+
+ entity_counts = video.get('EntityCounts') or {}
- media = self._parse_html5_media_entries(
- url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id,
- m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0]
+ def get_count(k):
+ v = entity_counts.get(k + 's') or {}
+ return int_or_none(dict_get(v, ('Count', 'StrCount')))
info = self._search_json_ld(webpage, video_id, default={})
- return merge_dicts(media, info, {
+ return merge_dicts({
'id': video_id,
'display_id': display_id,
- 'title': self._og_search_title(webpage, default=None) or display_id,
- 'description': self._og_search_description(webpage, default=None),
- 'thumbnail': self._og_search_thumbnail(webpage, default=None),
- })
+ 'title': video.get('Title'),
+ 'description': clean_html(video.get('Description')),
+ 'thumbnail': video.get('ThumbnailUrl'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'duration': int_or_none(video.get('DurationSeconds')) or parse_duration(video.get('Duration')),
+ 'view_count': get_count('View'),
+ 'like_count': get_count('Like'),
+ 'dislike_count': get_count('Dislike'),
+ 'average_rating': float_or_none(try_get(video, lambda x: x['EntityRating']['AvarageRate'])),
+ 'comment_count': get_count('Comment'),
+ }, info)
class MGTVIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html'
+ _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html'
IE_DESC = '芒果TV'
- _GEO_COUNTRIES = ['CN']
_TESTS = [{
'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html',
}, {
'url': 'http://www.mgtv.com/b/301817/3826653.html',
'only_matching': True,
+ }, {
+ 'url': 'https://w.mgtv.com/b/301817/3826653.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
+ tk2 = base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1]
try:
api_data = self._download_json(
'https://pcweb.api.mgtv.com/player/video', video_id, query={
- 'tk2': base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1],
+ 'tk2': tk2,
'video_id': video_id,
}, headers=self.geo_verification_headers())['data']
except ExtractorError as e:
stream_data = self._download_json(
'https://pcweb.api.mgtv.com/player/getSource', video_id, query={
'pm2': api_data['atc']['pm2'],
+ 'tk2': tk2,
'video_id': video_id,
}, headers=self.geo_verification_headers())['data']
stream_domain = stream_data['stream_domain'][0]
'only_matching': True,
}]
+    @staticmethod
+    def extract_child_with_type(parent, t):
+        # Return the first entry of parent['children'] whose 'type' equals t.
+        for child in parent['children']:
+            if child.get('type') == t:
+                return child
+        # No child matched: fail the same way next() does on an exhausted
+        # generator.
+        raise StopIteration
+
+    def _extract_mgid(self, webpage):
+        # Read the media URI from the JSON assigned to __DATA__ in the page,
+        # descending through the MainContainer and VideoPlayer children.
+        data_json = self._search_regex(
+            r'__DATA__\s*=\s*({.+?});', webpage, 'data')
+        page_data = self._parse_json(data_json, None)
+        container = self.extract_child_with_type(page_data, 'MainContainer')
+        player = self.extract_child_with_type(container, 'VideoPlayer')
+        return player['props']['media']['video']['config']['uri']
+
class MTVJapanIE(MTVServicesInfoExtractor):
IE_NAME = 'mtvjapan'
from ..compat import compat_urllib_parse_unquote
from ..utils import (
int_or_none,
- js_to_json,
parse_duration,
smuggle_url,
try_get,
webpage = self._download_webpage(url, video_id)
data = self._parse_json(self._search_regex(
- r'window\.__data\s*=\s*({.+});', webpage,
- 'bootstrap json'), video_id, js_to_json)
+ r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
+ webpage, 'bootstrap json'), video_id)['props']['initialState']
video_data = try_get(data, lambda x: x['video']['current'], dict)
if not video_data:
video_data = data['article']['content'][0]['primaryMedia']['video']
'params': {
'skip_download': True,
},
+ }, {
+ # with subtitles
+ 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html',
+ 'info_dict': {
+ 'id': 'extra18674',
+ 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring',
+ 'ext': 'mp4',
+ 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring',
+ 'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6',
+ 'uploader': 'ndrtv',
+ 'upload_date': '20201113',
+ 'duration': 1749,
+ 'subtitles': {
+ 'de': [{
+ 'ext': 'ttml',
+ 'url': r're:^https://www\.ndr\.de.+',
+ }],
+ },
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Unable to download f4m manifest'],
}, {
'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html',
'only_matching': True,
'preference': quality_key(thumbnail.get('quality')),
})
+ subtitles = {}
+ tracks = config.get('tracks')
+ if tracks and isinstance(tracks, list):
+ for track in tracks:
+ if not isinstance(track, dict):
+ continue
+ track_url = urljoin(url, track.get('src'))
+ if not track_url:
+ continue
+ subtitles.setdefault(track.get('srclang') or 'de', []).append({
+ 'url': track_url,
+ 'ext': 'ttml',
+ })
+
return {
'id': video_id,
'title': title,
'duration': duration,
'thumbnails': thumbnails,
'formats': formats,
+ 'subtitles': subtitles,
}
int_or_none,
parse_duration,
strip_or_none,
- try_get,
+ unescapeHTML,
unified_strdate,
unified_timestamp,
update_url_query,
_UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
_GEO_COUNTRIES = ['IT']
_GEO_BYPASS = False
- _BASE_URL = 'https://www.raiplay.it'
def _extract_relinker_info(self, relinker_url, video_id):
if not re.match(r'https?://', relinker_url):
class RaiPlayIE(RaiBaseIE):
- _VALID_URL = r'(?P<url>(?P<base>https?://(?:www\.)?raiplay\.it/.+?-)(?P<id>%s)(?P<ext>\.(?:html|json)))' % RaiBaseIE._UUID_RE
+ _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.(?:html|json))' % RaiBaseIE._UUID_RE
_TESTS = [{
+ 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter',
+ 'md5': '340aa3b7afb54bfd14a8c11786450d76',
+ 'info_dict': {
+ 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66',
+ 'ext': 'mp4',
+ 'title': 'La Casa Bianca',
+ 'alt_title': 'S2016 - Puntata del 23/10/2016',
+ 'description': 'md5:a09d45890850458077d1f68bb036e0a5',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Rai 3',
+ 'creator': 'Rai 3',
+ 'duration': 3278,
+ 'timestamp': 1477764300,
+ 'upload_date': '20161029',
+ 'series': 'La Casa Bianca',
+ 'season': '2016',
+ },
+ 'skip': 'This content is not available',
+ }, {
'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
'md5': '8970abf8caf8aef4696e7b1f2adfc696',
'info_dict': {
'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
'ext': 'mp4',
'title': 'Report del 07/04/2014',
- 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014 ',
+ 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014',
'description': 'md5:d730c168a58f4bb35600fc2f881ec04e',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Rai Gulp',
'duration': 6160,
+ 'series': 'Report',
+ 'season': '2013/14',
},
'params': {
'skip_download': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- url, base, video_id, ext = mobj.group('url', 'base', 'id', 'ext')
+ url, video_id = re.match(self._VALID_URL, url).groups()
media = self._download_json(
- '%s%s.json' % (base, video_id), video_id, 'Downloading video JSON')
+ url.replace('.html', '.json'), video_id, 'Downloading video JSON')
title = media['name']
video = media['video']
self._sort_formats(relinker_info['formats'])
thumbnails = []
- if 'images' in media:
- for _, value in media.get('images').items():
- if value:
- thumbnails.append({
- 'url': urljoin(RaiBaseIE._BASE_URL, value.replace('[RESOLUTION]', '600x400'))
- })
+ for _, value in media.get('images', {}).items():
+ if value:
+ thumbnails.append({
+ 'url': urljoin(url, value),
+ })
- timestamp = unified_timestamp(try_get(
- media, lambda x: x['availabilities'][0]['start'], compat_str))
+ date_published = media.get('date_published')
+ time_published = media.get('time_published')
+ if date_published and time_published:
+ date_published += ' ' + time_published
subtitles = self._extract_subtitles(url, video.get('subtitles'))
+ program_info = media.get('program_info') or {}
+ season = media.get('season')
+
info = {
'id': video_id,
'title': self._live_title(title) if relinker_info.get(
'is_live') else title,
- 'alt_title': media.get('subtitle'),
+ 'alt_title': strip_or_none(media.get('subtitle')),
'description': media.get('description'),
'uploader': strip_or_none(media.get('channel')),
- 'creator': strip_or_none(media.get('editor')),
+ 'creator': strip_or_none(media.get('editor') or None),
'duration': parse_duration(video.get('duration')),
- 'timestamp': timestamp,
+ 'timestamp': unified_timestamp(date_published),
'thumbnails': thumbnails,
- 'series': try_get(
- media, lambda x: x['isPartOf']['name'], compat_str),
- 'season_number': int_or_none(try_get(
- media, lambda x: x['isPartOf']['numeroStagioni'])),
- 'season': media.get('stagione') or None,
+ 'series': program_info.get('name'),
+ 'season_number': int_or_none(season),
+ 'season': season if (season and not season.isdigit()) else None,
+ 'episode': media.get('episode_title'),
+ 'episode_number': int_or_none(media.get('episode')),
'subtitles': subtitles,
}
'display_id': 'rainews24',
'ext': 'mp4',
'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
- 'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497',
+ 'description': 'md5:6eca31500550f9376819f174e5644754',
'uploader': 'Rai News 24',
'creator': 'Rai News 24',
'is_live': True,
def _real_extract(self, url):
+ # Find the ContentItem UUID in the page and hand the actual extraction
+ # over to RaiPlayIE through a url_transparent result.
display_id = self._match_id(url)
- media = self._download_json(
- '%s.json' % urljoin(RaiBaseIE._BASE_URL, 'dirette/' + display_id),
- display_id, 'Downloading channel JSON')
-
- title = media['name']
- video = media['video']
- video_id = media['id'].replace('ContentItem-', '')
+ webpage = self._download_webpage(url, display_id)
- relinker_info = self._extract_relinker_info(video['content_url'], video_id)
- self._sort_formats(relinker_info['formats'])
+ # The UUID follows "ContentItem-" in the data-uniquename attribute.
+ video_id = self._search_regex(
+ r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE,
+ webpage, 'content id')
- info = {
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': RaiPlayIE.ie_key(),
+ 'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id,
'id': video_id,
'display_id': display_id,
- 'title': self._live_title(title) if relinker_info.get(
- 'is_live') else title,
- 'alt_title': media.get('subtitle'),
- 'description': media.get('description'),
- 'uploader': strip_or_none(media.get('channel')),
- 'creator': strip_or_none(media.get('editor')),
- 'duration': parse_duration(video.get('duration')),
}
- info.update(relinker_info)
- return info
-
class RaiPlayPlaylistIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+)'
'info_dict': {
'id': 'nondirloalmiocapo',
'title': 'Non dirlo al mio capo',
- 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b',
+ 'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86',
},
'playlist_mincount': 12,
}]
def _real_extract(self, url):
+ # Scrape the programme page: title and description come from meta tags,
+ # entries from the /raiplay/video/ links found in the markup.
playlist_id = self._match_id(url)
- media = self._download_json(
- '%s.json' % urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id),
- playlist_id, 'Downloading program JSON')
-
- title = media['name']
- description = media['program_info']['description']
+ webpage = self._download_webpage(url, playlist_id)
- content_sets = [s['id'] for b in media['blocks'] for s in b['sets']]
+ title = self._html_search_meta(
+ ('programma', 'nomeProgramma'), webpage, 'title')
+ description = unescapeHTML(self._html_search_meta(
+ ('description', 'og:description'), webpage, 'description'))
entries = []
- for cs in content_sets:
- medias = self._download_json(
- '%s/%s.json' % (urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id), cs),
- cs, 'Downloading content set JSON')
- for m in medias['items']:
- video_url = urljoin(url, m['path_id'])
- entries.append(self.url_result(
- video_url, ie=RaiPlayIE.ie_key(),
- video_id=RaiPlayIE._match_id(video_url)))
+ # Each matching href is resolved against the page URL and delegated
+ # to RaiPlayIE.
+ for mobj in re.finditer(
+ r'<a\b[^>]+\bhref=(["\'])(?P<path>/raiplay/video/.+?)\1',
+ webpage):
+ video_url = urljoin(url, mobj.group('path'))
+ entries.append(self.url_result(
+ video_url, ie=RaiPlayIE.ie_key(),
+ video_id=RaiPlayIE._match_id(video_url)))
return self.playlist_result(entries, playlist_id, title, description)
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 1758,
'upload_date': '20140612',
- }
+ },
+ 'skip': 'This content is available only in Italy',
}, {
# with ContentItem in many metas
'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html',
'duration': 2214,
'upload_date': '20161103',
}
+ }, {
+ # drawMediaRaiTV(...)
+ 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html',
+ 'md5': '2dd727e61114e1ee9c47f0da6914e178',
+ 'info_dict': {
+ 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af',
+ 'ext': 'mp4',
+ 'title': 'Il pacco',
+ 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'upload_date': '20141221',
+ },
+ 'skip': 'This content is not available',
}, {
# initEdizione('ContentItem-...'
'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined',
'upload_date': '20170401',
},
'skip': 'Changes daily',
+ }, {
+ # HDS live stream with only relinker URL
+ 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews',
+ 'info_dict': {
+ 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc',
+ 'ext': 'flv',
+ 'title': 'EuroNews',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'This content is available only in Italy',
}, {
# HLS live stream with ContentItem in og:url
'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html',
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ float_or_none,
+ int_or_none,
+ unified_timestamp,
+ urlencode_postdata,
+ url_or_none,
+)
class ServusIE(InfoExtractor):
(?:www\.)?
(?:
servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)|
- servustv\.com/videos
+ (?:servustv|pm-wissen)\.com/videos
)
/(?P<id>[aA]{2}-\w+|\d+-\d+)
'''
_TESTS = [{
# new URL schema
'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/',
- 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4',
+ 'md5': '60474d4c21f3eb148838f215c37f02b9',
'info_dict': {
'id': 'AA-1T6VBU5PW1W12',
'ext': 'mp4',
'title': 'Die Grünen aus Sicht des Volkes',
+ 'alt_title': 'Talk im Hangar-7 Voxpops Gruene',
'description': 'md5:1247204d85783afe3682644398ff2ec4',
'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 62.442,
+ 'timestamp': 1605193976,
+ 'upload_date': '20201112',
+ 'series': 'Talk im Hangar-7',
+ 'season': 'Season 9',
+ 'season_number': 9,
+ 'episode': 'Episode 31 - September 14',
+ 'episode_number': 31,
}
}, {
# old URL schema
}, {
'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/',
'only_matching': True,
+ }, {
+ 'url': 'https://www.pm-wissen.com/videos/aa-24mus4g2w2112/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
+ # Fetch an access token, download the asset JSON with it, then build
+ # formats from the asset's resources and metadata from its attributes.
video_id = self._match_id(url).upper()
- webpage = self._download_webpage(url, video_id)
- title = self._search_regex(
- (r'videoLabel\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
- r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'),
- webpage, 'title', default=None,
- group='title') or self._og_search_title(webpage)
- title = re.sub(r'\s*-\s*Servus TV\s*$', '', title)
- description = self._og_search_description(webpage)
- thumbnail = self._og_search_thumbnail(webpage)
+ # Client credentials grant using a fixed Basic authorization value.
+ token = self._download_json(
+ 'https://auth.redbullmediahouse.com/token', video_id,
+ 'Downloading token', data=urlencode_postdata({
+ 'grant_type': 'client_credentials',
+ }), headers={
+ 'Authorization': 'Basic SVgtMjJYNEhBNFdEM1cxMTpEdDRVSkFLd2ZOMG5IMjB1NGFBWTBmUFpDNlpoQ1EzNA==',
+ })
+ access_token = token['access_token']
+ token_type = token.get('token_type', 'Bearer')
- formats = self._extract_m3u8_formats(
- 'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id,
- video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
+ video = self._download_json(
+ 'https://sparkle-api.liiift.io/api/v1/stv/channels/international/assets/%s' % video_id,
+ video_id, 'Downloading video JSON', headers={
+ 'Authorization': '%s %s' % (token_type, access_token),
+ })
+
+ formats = []
+ thumbnail = None
+ # Resources without a valid URL are skipped. jpg / reference_keyframe
+ # resources become the thumbnail (the last one seen wins); the rest
+ # are dispatched by declared type or by URL extension.
+ for resource in video['resources']:
+ if not isinstance(resource, dict):
+ continue
+ format_url = url_or_none(resource.get('url'))
+ if not format_url:
+ continue
+ extension = resource.get('extension')
+ type_ = resource.get('type')
+ if extension == 'jpg' or type_ == 'reference_keyframe':
+ thumbnail = format_url
+ continue
+ ext = determine_ext(format_url)
+ if type_ == 'dash' or ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ format_url, video_id, mpd_id='dash', fatal=False))
+ elif type_ == 'hls' or ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ elif extension == 'mp4' or ext == 'mp4':
+ formats.append({
+ 'url': format_url,
+ 'format_id': type_,
+ 'width': int_or_none(resource.get('width')),
+ 'height': int_or_none(resource.get('height')),
+ })
self._sort_formats(formats)
+ # Flatten the fieldKey/fieldValue attribute list into a dict, dropping
+ # entries with an empty key or value.
+ attrs = {}
+ for attribute in video['attributes']:
+ if not isinstance(attribute, dict):
+ continue
+ key = attribute.get('fieldKey')
+ value = attribute.get('fieldValue')
+ if not key or not value:
+ continue
+ attrs[key] = value
+
+ title = attrs.get('title_stv') or video_id
+ alt_title = attrs.get('title')
+ description = attrs.get('long_description') or attrs.get('short_description')
+ series = attrs.get('label')
+ season = attrs.get('season')
+ episode = attrs.get('chapter')
+ # NOTE(review): the duration attribute is presumably in milliseconds,
+ # hence scale=1000 - confirm against the API.
+ duration = float_or_none(attrs.get('duration'), scale=1000)
+ # Numbers are parsed out of the "Season N" / "Episode N" strings.
+ season_number = int_or_none(self._search_regex(
+ r'Season (\d+)', season or '', 'season number', default=None))
+ episode_number = int_or_none(self._search_regex(
+ r'Episode (\d+)', episode or '', 'episode number', default=None))
+
return {
'id': video_id,
'title': title,
+ 'alt_title': alt_title,
'description': description,
'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': unified_timestamp(video.get('lastPublished')),
+ 'series': series,
+ 'season': season,
+ 'season_number': season_number,
+ 'episode': episode,
+ 'episode_number': episode_number,
'formats': formats,
}
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from .nexx import (
- NexxIE,
- NexxEmbedIE,
-)
-from .spiegeltv import SpiegeltvIE
-from ..compat import compat_urlparse
-from ..utils import (
- parse_duration,
- strip_or_none,
- unified_timestamp,
-)
+from .jwplatform import JWPlatformIE
class SpiegelIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$'
+ _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
+ _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' % _UUID_RE
_TESTS = [{
'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
- 'md5': 'b57399839d055fccfeb9a0455c439868',
+ 'md5': '50c7948883ec85a3e431a0a44b7ad1d6',
'info_dict': {
- 'id': '563747',
+ 'id': 'II0BUyxY',
+ 'display_id': '1259285',
'ext': 'mp4',
- 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv',
+ 'title': 'Vulkan Tungurahua in Ecuador ist wieder aktiv - DER SPIEGEL - Wissenschaft',
'description': 'md5:8029d8310232196eb235d27575a8b9f4',
- 'duration': 49,
+ 'duration': 48.0,
'upload_date': '20130311',
- 'timestamp': 1362994320,
+ 'timestamp': 1362997920,
},
}, {
'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html',
- 'md5': '5b6c2f4add9d62912ed5fc78a1faed80',
- 'info_dict': {
- 'id': '580988',
- 'ext': 'mp4',
- 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers',
- 'description': 'md5:c2322b65e58f385a820c10fa03b2d088',
- 'duration': 983,
- 'upload_date': '20131115',
- 'timestamp': 1384546642,
- },
+ 'only_matching': True,
}, {
- 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html',
- 'md5': '97b91083a672d72976faa8433430afb9',
- 'info_dict': {
- 'id': '601883',
- 'ext': 'mp4',
- 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.',
- 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"',
- 'upload_date': '20140904',
- 'timestamp': 1409834160,
- }
+ 'url': 'https://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html',
+ 'only_matching': True,
}, {
- 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html',
+ 'url': 'https://www.spiegel.de/panorama/urteile-im-goldmuenzenprozess-haftstrafen-fuer-clanmitglieder-a-aae8df48-43c1-4c61-867d-23f0a2d254b7',
'only_matching': True,
}, {
- # nexx video
'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html',
'only_matching': True,
+ }, {
+ 'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
- metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id
- handle = self._request_webpage(metadata_url, video_id)
-
- # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html
- if SpiegeltvIE.suitable(handle.geturl()):
- return self.url_result(handle.geturl(), 'Spiegeltv')
-
- video_data = self._parse_json(self._webpage_read_content(
- handle, metadata_url, video_id), video_id)
- title = video_data['title']
- nexx_id = video_data['nexxOmniaId']
- domain_id = video_data.get('nexxOmniaDomain') or '748'
-
+ webpage = self._download_webpage(url, video_id)
+ media_id = self._html_search_regex(
+ r'(&#34;|["\'])mediaId\1\s*:\s*(&#34;|["\'])(?P<id>(?:(?!\2).)+)\2',
+ webpage, 'media id', group='id')
return {
'_type': 'url_transparent',
'id': video_id,
- 'url': 'nexx:%s:%s' % (domain_id, nexx_id),
- 'title': title,
- 'description': strip_or_none(video_data.get('teaser')),
- 'duration': parse_duration(video_data.get('duration')),
- 'timestamp': unified_timestamp(video_data.get('datum')),
- 'ie_key': NexxIE.ie_key(),
+ 'display_id': video_id,
+ 'url': 'jwplatform:%s' % media_id,
+ 'title': self._og_search_title(webpage, default=None),
+ 'ie_key': JWPlatformIE.ie_key(),
}
-
-
-class SpiegelArticleIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html'
- IE_NAME = 'Spiegel:Article'
- IE_DESC = 'Articles on spiegel.de'
- _TESTS = [{
- 'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html',
- 'info_dict': {
- 'id': '1516455',
- 'ext': 'mp4',
- 'title': 'Faszination Badminton: Nennt es bloß nicht Federball',
- 'description': 're:^Patrick Kämnitz gehört.{100,}',
- 'upload_date': '20140825',
- },
- }, {
- 'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html',
- 'info_dict': {
-
- },
- 'playlist_count': 6,
- }, {
- # Nexx iFrame embed
- 'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html',
- 'info_dict': {
- 'id': '161464',
- 'ext': 'mp4',
- 'title': 'Nervenkitzel Achterbahn',
- 'alt_title': 'Karussellbauer in Deutschland',
- 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
- 'release_year': 2005,
- 'creator': 'SPIEGEL TV',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 2761,
- 'timestamp': 1394021479,
- 'upload_date': '20140305',
- },
- 'params': {
- 'format': 'bestvideo',
- 'skip_download': True,
- },
- }]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- # Single video on top of the page
- video_link = self._search_regex(
- r'<a href="([^"]+)" onclick="return spOpenVideo\(this,', webpage,
- 'video page URL', default=None)
- if video_link:
- video_url = compat_urlparse.urljoin(
- self.http_scheme() + '//spiegel.de/', video_link)
- return self.url_result(video_url)
-
- # Multiple embedded videos
- embeds = re.findall(
- r'<div class="vid_holder[0-9]+.*?</div>\s*.*?url\s*=\s*"([^"]+)"',
- webpage)
- entries = [
- self.url_result(compat_urlparse.urljoin(
- self.http_scheme() + '//spiegel.de/', embed_path))
- for embed_path in embeds]
- if embeds:
- return self.playlist_result(entries)
-
- return self.playlist_from_matches(
- NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key())
class TwentyThreeVideoIE(InfoExtractor):
IE_NAME = '23video'
- _VALID_URL = r'https?://video\.(?P<domain>twentythree\.net|23video\.com|filmweb\.no)/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)'
- _TEST = {
+ _VALID_URL = r'https?://(?P<domain>[^.]+\.(?:twentythree\.net|23video\.com|filmweb\.no))/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)'
+ _TESTS = [{
'url': 'https://video.twentythree.net/v.ihtml/player.html?showDescriptions=0&source=site&photo%5fid=20448876&autoPlay=1',
'md5': '75fcf216303eb1dae9920d651f85ced4',
'info_dict': {
'uploader_id': '12258964',
'uploader': 'Rasmus Bysted',
}
- }
+ }, {
+ 'url': 'https://bonnier-publications-danmark.23video.com/v.ihtml/player.html?token=f0dc46476e06e13afd5a1f84a29e31e8&source=embed&photo%5fid=36137620',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
domain, query, photo_id = re.match(self._VALID_URL, url).groups()
- base_url = 'https://video.%s' % domain
+ base_url = 'https://%s' % domain
photo_data = self._download_json(
base_url + '/api/photo/list?' + query, photo_id, query={
'format': 'json',
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import unified_timestamp
-import re
+from ..utils import (
+ dict_get,
+ int_or_none,
+ unified_timestamp,
+)
class URPlayIE(InfoExtractor):
'info_dict': {
'id': '203704',
'ext': 'mp4',
- 'title': 'Om vetenskap, kritiskt tänkande och motstånd',
+ 'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd',
'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a',
'timestamp': 1513292400,
'upload_date': '20171214',
'ext': 'mp4',
'title': 'Tripp, Trapp, Träd : Sovkudde',
'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1',
- 'timestamp': 1440093600,
+ 'timestamp': 1440086400,
'upload_date': '20150820',
},
}, {
def _real_extract(self, url):
video_id = self._match_id(url)
-
+ url = url.replace('skola.se/Produkter', 'play.se/program')
webpage = self._download_webpage(url, video_id)
- urplayer_data = re.sub("&quot;", "\"", self._search_regex(
- r'components\/Player\/Player\" data-react-props=\"({.+?})\"',
- webpage, 'urplayer data'))
- urplayer_data = self._parse_json(urplayer_data, video_id)
- for i in range(len(urplayer_data['accessibleEpisodes'])):
- if urplayer_data.get('accessibleEpisodes', {})[i].get('id') == int(video_id):
- urplayer_data = urplayer_data['accessibleEpisodes'][i]
- break
+ urplayer_data = self._parse_json(self._html_search_regex(
+ r'data-react-class="components/Player/Player"[^>]+data-react-props="({.+?})"',
+ webpage, 'urplayer data'), video_id)['currentProduct']
+ episode = urplayer_data['title']
host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect']
formats = []
- urplayer_streams = urplayer_data.get("streamingInfo")
- for quality in ('sd'), ('hd'):
- location = (urplayer_streams.get("raw", {}).get(quality, {}).get("location")
- or urplayer_streams.get("sweComplete", {}).get(quality, {}).get("location"))
- if location:
+ urplayer_streams = urplayer_data.get('streamingInfo', {})
+
+ for k, v in urplayer_streams.get('raw', {}).items():
+ if not (k in ('sd', 'hd') and isinstance(v, dict)):
+ continue
+ file_http = v.get('location')
+ if file_http:
formats.extend(self._extract_wowza_formats(
- 'http://%s/%s/playlist.m3u8' % (host, location), video_id,
- skip_protocols=['f4m', 'rtmp', 'rtsp']))
+ 'http://%s/%splaylist.m3u8' % (host, file_http),
+ video_id, skip_protocols=['f4m', 'rtmp', 'rtsp']))
self._sort_formats(formats)
+
subtitles = {}
subs = urplayer_streams.get("sweComplete", {}).get("tt", {}).get("location")
if subs:
'url': subs,
})
+ image = urplayer_data.get('image') or {}
+ thumbnails = []
+ for k, v in image.items():
+ t = {
+ 'id': k,
+ 'url': v,
+ }
+ wh = k.split('x')
+ if len(wh) == 2:
+ t.update({
+ 'width': int_or_none(wh[0]),
+ 'height': int_or_none(wh[1]),
+ })
+ thumbnails.append(t)
+
+ series = urplayer_data.get('series') or {}
+ series_title = dict_get(series, ('seriesTitle', 'title')) or dict_get(urplayer_data, ('seriesTitle', 'mainTitle'))
+
return {
'id': video_id,
- 'title': urplayer_data['title'],
- 'description': self._og_search_description(webpage),
- 'thumbnail': urplayer_data.get('image', {}).get('1280x720'),
- 'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'),
- webpage, 'timestamp')),
- 'series': urplayer_data.get('seriesTitle'),
'subtitles': subtitles,
+ 'title': '%s : %s' % (series_title, episode) if series_title else episode,
+ 'description': urplayer_data.get('description'),
+ 'thumbnails': thumbnails,
+ 'timestamp': unified_timestamp(urplayer_data.get('publishedAt')),
+ 'series': series_title,
'formats': formats,
+ 'duration': int_or_none(urplayer_data.get('duration')),
+ 'categories': urplayer_data.get('categories'),
+ 'tags': urplayer_data.get('keywords'),
+ 'season': series.get('label'),
+ 'episode': episode,
+ 'episode_number': int_or_none(urplayer_data.get('episodeNumber')),
}
# coding: utf-8
from __future__ import unicode_literals
-from .adobepass import AdobePassIE
-from ..utils import (
- NO_DEFAULT,
- smuggle_url,
- update_url_query,
-)
+from .nbc import NBCIE
-class USANetworkIE(AdobePassIE):
- _VALID_URL = r'https?://(?:www\.)?usanetwork\.com/(?:[^/]+/videos|movies)/(?P<id>[^/?#]+)'
- _TEST = {
- 'url': 'http://www.usanetwork.com/mrrobot/videos/hpe-cybersecurity',
- 'md5': '33c0d2ba381571b414024440d08d57fd',
+class USANetworkIE(NBCIE):
+ _VALID_URL = r'https?(?P<permalink>://(?:www\.)?usanetwork\.com/(?:[^/]+/videos?|movies?)/(?:[^/]+/)?(?P<id>\d+))'
+ _TESTS = [{
+ 'url': 'https://www.usanetwork.com/peacock-trailers/video/intelligence-trailer/4185302',
'info_dict': {
- 'id': '3086229',
+ 'id': '4185302',
'ext': 'mp4',
- 'title': 'HPE Cybersecurity',
- 'description': 'The more we digitize our world, the more vulnerable we are.',
- 'upload_date': '20160818',
- 'timestamp': 1471535460,
- 'uploader': 'NBCU-USA',
+ 'title': 'Intelligence (Trailer)',
+ 'description': 'A maverick NSA agent enlists the help of a junior systems analyst in a workplace power grab.',
+ 'upload_date': '20200715',
+ 'timestamp': 1594785600,
+ 'uploader': 'NBCU-MPAT',
},
- }
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
-
- def _x(name, default=NO_DEFAULT):
- return self._search_regex(
- r'data-%s\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % name,
- webpage, name, default=default, group='value')
-
- video_id = _x('mpx-guid')
- title = _x('episode-title')
- mpx_account_id = _x('mpx-account-id', '2304992029')
-
- query = {
- 'mbr': 'true',
- }
- if _x('is-full-episode', None) == '1':
- query['manifest'] = 'm3u'
-
- if _x('is-entitlement', None) == '1':
- adobe_pass = {}
- drupal_settings = self._search_regex(
- r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
- webpage, 'drupal settings', fatal=False)
- if drupal_settings:
- drupal_settings = self._parse_json(drupal_settings, video_id, fatal=False)
- if drupal_settings:
- adobe_pass = drupal_settings.get('adobePass', {})
- resource = self._get_mvpd_resource(
- adobe_pass.get('adobePassResourceId', 'usa'),
- title, video_id, _x('episode-rating', 'TV-14'))
- query['auth'] = self._extract_mvpd_auth(
- url, video_id, adobe_pass.get('adobePassRequestorId', 'usa'), resource)
-
- info = self._search_json_ld(webpage, video_id, default={})
- info.update({
- '_type': 'url_transparent',
- 'url': smuggle_url(update_url_query(
- 'http://link.theplatform.com/s/HNK2IC/media/guid/%s/%s' % (mpx_account_id, video_id),
- query), {'force_smil_url': True}),
- 'id': video_id,
- 'title': title,
- 'series': _x('show-title', None),
- 'episode': title,
- 'ie_key': 'ThePlatform',
- })
- return info
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }]
class UstreamIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)'
IE_NAME = 'ustream'
_TESTS = [{
'url': 'http://www.ustream.tv/recorded/20274954',
'params': {
'skip_download': True, # m3u8 download
},
+ }, {
+ 'url': 'https://video.ibm.com/embed/recorded/128240221?&autoplay=true&controls=true&volume=100',
+ 'only_matching': True,
}]
@staticmethod
def _extract_url(webpage):
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
+ r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage)
if mobj is not None:
return mobj.group('url')
def _real_extract(self, url):
album_id = self._match_id(url)
- webpage = self._download_webpage(url, album_id)
- viewer = self._parse_json(self._search_regex(
- r'bootstrap_data\s*=\s*({.+?})</script>',
- webpage, 'bootstrap data'), album_id)['viewer']
+ viewer = self._download_json(
+ 'https://vimeo.com/_rv/viewer', album_id, fatal=False)
+ if not viewer:
+ webpage = self._download_webpage(url, album_id)
+ viewer = self._parse_json(self._search_regex(
+ r'bootstrap_data\s*=\s*({.+?})</script>',
+ webpage, 'bootstrap data'), album_id)['viewer']
jwt = viewer['jwt']
album = self._download_json(
'https://api.vimeo.com/albums/' + album_id,
import re
import time
import itertools
+import json
from .common import InfoExtractor
from .naver import NaverBaseIE
-from ..compat import compat_str
+from ..compat import (
+ compat_HTTPError,
+ compat_str,
+)
from ..utils import (
ExtractorError,
+ int_or_none,
merge_dicts,
try_get,
urlencode_postdata,
)
-class VLiveIE(NaverBaseIE):
+class VLiveBaseIE(NaverBaseIE):
+ _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'
+
+
+class VLiveIE(VLiveBaseIE):
IE_NAME = 'vlive'
- _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|post)/(?P<id>(?:\d-)?[0-9]+)'
+ _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|embed)/(?P<id>[0-9]+)'
_NETRC_MACHINE = 'vlive'
_TESTS = [{
- 'url': 'https://www.vlive.tv/video/1326',
- 'md5': 'cc7314812855ce56de70a06a27314983',
- 'info_dict': {
- 'id': '1326',
- 'ext': 'mp4',
- 'title': "[V LIVE] Girl's Day's Broadcast",
- 'creator': "Girl's Day",
- 'view_count': int,
- 'uploader_id': 'muploader_a',
- },
- },
- {
- 'url': 'https://vlive.tv/post/1-18244258',
+ 'url': 'http://www.vlive.tv/video/1326',
'md5': 'cc7314812855ce56de70a06a27314983',
'info_dict': {
'id': '1326',
'ext': 'mp4',
- 'title': "[V LIVE] Girl's Day's Broadcast",
+ 'title': "Girl's Day's Broadcast",
'creator': "Girl's Day",
'view_count': int,
'uploader_id': 'muploader_a',
},
- },
- {
- 'url': 'https://www.vlive.tv/video/16937',
+ }, {
+ 'url': 'http://www.vlive.tv/video/16937',
'info_dict': {
'id': '16937',
'ext': 'mp4',
- 'title': '[V LIVE] 첸백시 걍방',
+ 'title': '첸백시 걍방',
'creator': 'EXO',
'view_count': int,
'subtitles': 'mincount:12',
'subtitles': 'mincount:10',
},
'skip': 'This video is only available for CH+ subscribers',
+ }, {
+ 'url': 'https://www.vlive.tv/embed/1326',
+ 'only_matching': True,
}]
- @classmethod
- def suitable(cls, url):
- return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url)
-
def _real_initialize(self):
self._login()
if not is_logged_in():
raise ExtractorError('Unable to log in', expected=True)
- def _real_extract(self, url):
- # url may match on a post or a video url with a post_id potentially matching a video_id
- working_id = self._match_id(url)
- webpage = self._download_webpage(url, working_id)
-
- PARAMS_RE = r'window\.__PRELOADED_STATE__\s*=\s*({.*});?\s*</script>'
- PARAMS_FIELD = 'params'
-
- params = self._search_regex(
- PARAMS_RE, webpage, PARAMS_FIELD, default='', flags=re.DOTALL)
- params = self._parse_json(params, working_id, fatal=False)
-
- video_params = try_get(params, lambda x: x["postDetail"]["post"]["officialVideo"], dict)
-
- if video_params is None:
- error = try_get(params, lambda x: x["postDetail"]["error"], dict)
- error_data = try_get(error, lambda x: x["data"], dict)
- error_video = try_get(error_data, lambda x: x["officialVideo"], dict)
- error_msg = try_get(error, lambda x: x["message"], compat_str)
- product_type = try_get(error_data,
- [lambda x: x["officialVideo"]["productType"],
- lambda x: x["board"]["boardType"]],
- compat_str)
-
- if error_video is not None:
- if product_type in ('VLIVE_PLUS', 'VLIVE+'):
- self.raise_login_required('This video is only available with V LIVE+.')
- elif error_msg is not None:
- raise ExtractorError('V LIVE reported the following error: %s' % error_msg)
- else:
- raise ExtractorError('Failed to extract video parameters.')
- elif 'post' in url:
- raise ExtractorError('Url does not appear to be a video post.', expected=True)
- else:
- raise ExtractorError('Failed to extract video parameters.')
-
- video_id = working_id if 'video' in url else str(video_params["videoSeq"])
+ def _call_api(self, path_template, video_id, fields=None):
+ query = {'appId': self._APP_ID}
+ if fields:
+ query['fields'] = fields
+ return self._download_json(
+ 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id,
+ 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0],
+ headers={'Referer': 'https://www.vlive.tv/'}, query=query)
- video_type = video_params["type"]
- if video_type in ('VOD'):
- encoding_status = video_params["encodingStatus"]
- if encoding_status == 'COMPLETE':
- return self._replay(video_id, webpage, params, video_params)
- else:
- raise ExtractorError('VOD encoding not yet complete. Please try again later.',
- expected=True)
- elif video_type in ('LIVE'):
- video_status = video_params["status"]
- if video_status in ('RESERVED'):
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ try:
+ post = self._call_api(
+ 'post/v1.0/officialVideoPost-%s', video_id,
+ 'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId}')
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ self.raise_login_required(json.loads(e.cause.read().decode())['message'])
+ raise
+
+ video = post['officialVideo']
+
+ def get_common_fields():
+ channel = post.get('channel') or {}
+ return {
+ 'title': video.get('title'),
+ 'creator': post.get('author', {}).get('nickname'),
+ 'channel': channel.get('channelName'),
+ 'channel_id': channel.get('channelCode'),
+ 'duration': int_or_none(video.get('playTime')),
+ 'view_count': int_or_none(video.get('playCount')),
+ 'like_count': int_or_none(video.get('likeCount')),
+ 'comment_count': int_or_none(video.get('commentCount')),
+ }
+
+ video_type = video.get('type')
+ if video_type == 'VOD':
+ inkey = self._call_api('video/v1.0/vod/%s/inkey', video_id)['inkey']
+ vod_id = video['vodId']
+ return merge_dicts(
+ get_common_fields(),
+ self._extract_video_info(video_id, vod_id, inkey))
+ elif video_type == 'LIVE':
+ status = video.get('status')
+ if status == 'ON_AIR':
+ stream_url = self._call_api(
+ 'old/v3/live/%s/playInfo',
+ video_id)['result']['adaptiveStreamUrl']
+ formats = self._extract_m3u8_formats(stream_url, video_id, 'mp4')
+ info = get_common_fields()
+ info.update({
+ 'title': self._live_title(video['title']),
+ 'id': video_id,
+ 'formats': formats,
+ 'is_live': True,
+ })
+ return info
+ elif status == 'ENDED':
+ raise ExtractorError(
+ 'Uploading for replay. Please wait...', expected=True)
+ elif status == 'RESERVED':
raise ExtractorError('Coming soon!', expected=True)
- elif video_status in ('ENDED', 'END'):
- raise ExtractorError('Uploading for replay. Please wait...', expected=True)
+ elif video.get('exposeStatus') == 'CANCEL':
+ raise ExtractorError(
+ 'We are sorry, but the live broadcast has been canceled.',
+ expected=True)
else:
- return self._live(video_id, webpage, params)
- else:
- raise ExtractorError('Unknown video type %s' % video_type)
-
- def _get_common_fields(self, webpage, params):
- title = self._og_search_title(webpage)
- description = self._html_search_meta(
- ['og:description', 'description', 'twitter:description'],
- webpage, 'description', default=None)
- creator = (try_get(params, lambda x: x["channel"]["channel"]["channelName"], compat_str)
- or self._search_regex(r'on (.*) channel', description or '', 'creator', fatal=False))
- thumbnail = self._og_search_thumbnail(webpage)
- return {
- 'title': title,
- 'creator': creator,
- 'thumbnail': thumbnail,
- }
-
- def _live(self, video_id, webpage, params):
- LIVE_INFO_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/old/v3/live/%s/playInfo' % video_id
- play_info = self._download_json(LIVE_INFO_ENDPOINT, video_id,
- headers={"referer": "https://www.vlive.tv"})
-
- streams = try_get(play_info, lambda x: x["result"]["streamList"], list) or []
-
- formats = []
- for stream in streams:
- formats.extend(self._extract_m3u8_formats(
- stream['serviceUrl'], video_id, 'mp4',
- fatal=False, live=True))
- self._sort_formats(formats)
-
- info = self._get_common_fields(webpage, params)
- info.update({
- 'title': self._live_title(info['title']),
- 'id': video_id,
- 'formats': formats,
- 'is_live': True,
- })
- return info
+ raise ExtractorError('Unknown status ' + status)
- def _replay(self, video_id, webpage, params, video_params):
- long_video_id = video_params["vodId"]
- VOD_KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id
- key_json = self._download_json(VOD_KEY_ENDPOINT, video_id,
- headers={"referer": "https://www.vlive.tv"})
- key = key_json["inkey"]
-
- return merge_dicts(
- self._get_common_fields(webpage, params),
- self._extract_video_info(video_id, long_video_id, key))
-
-
-class VLiveChannelIE(InfoExtractor):
+class VLiveChannelIE(VLiveBaseIE):
IE_NAME = 'vlive:channel'
- _VALID_URL = r'https?://(?:(?:www|m)\.)?(?:channels\.vlive\.tv/|vlive\.tv/channels?/)(?P<id>[0-9A-Z]+)'
+ _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<id>[0-9A-Z]+)'
_TESTS = [{
- 'url': 'https://channels.vlive.tv/FCD4B',
+ 'url': 'http://channels.vlive.tv/FCD4B',
'info_dict': {
'id': 'FCD4B',
'title': 'MAMAMOO',
'playlist_mincount': 110
}, {
'url': 'https://www.vlive.tv/channel/FCD4B',
- 'info_dict': {
- 'id': 'FCD4B',
- 'title': 'MAMAMOO',
- },
- 'playlist_mincount': 110
+ 'only_matching': True,
}]
- _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'
+
+ def _call_api(self, path, channel_key_suffix, channel_value, note, query):
+ q = {
+ 'app_id': self._APP_ID,
+ 'channel' + channel_key_suffix: channel_value,
+ }
+ q.update(query)
+ return self._download_json(
+ 'http://api.vfan.vlive.tv/vproxy/channelplus/' + path,
+ channel_value, note='Downloading ' + note, query=q)['result']
def _real_extract(self, url):
channel_code = self._match_id(url)
- webpage = self._download_webpage(
- 'http://channels.vlive.tv/%s/video' % channel_code, channel_code)
-
- app_id = None
-
- app_js_url = self._search_regex(
- r'<script[^>]+src=(["\'])(?P<url>http.+?/app\.js.*?)\1',
- webpage, 'app js', default=None, group='url')
-
- if app_js_url:
- app_js = self._download_webpage(
- app_js_url, channel_code, 'Downloading app JS', fatal=False)
- if app_js:
- app_id = self._search_regex(
- r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]',
- app_js, 'app id', default=None)
-
- app_id = app_id or self._APP_ID
-
- channel_info = self._download_json(
- 'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode',
- channel_code, note='Downloading decode channel code',
- query={
- 'app_id': app_id,
- 'channelCode': channel_code,
- '_': int(time.time())
- })
+ channel_seq = self._call_api(
+ 'decodeChannelCode', 'Code', channel_code,
+ 'decode channel code', {})['channelSeq']
- channel_seq = channel_info['result']['channelSeq']
channel_name = None
entries = []
for page_num in itertools.count(1):
- video_list = self._download_json(
- 'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList',
- channel_code, note='Downloading channel list page #%d' % page_num,
- query={
- 'app_id': app_id,
- 'channelSeq': channel_seq,
+ video_list = self._call_api(
+ 'getChannelVideoList', 'Seq', channel_seq,
+ 'channel list page #%d' % page_num, {
# Large values of maxNumOfRows (~300 or above) may cause
# empty responses (see [1]), e.g. this happens for [2] that
# has more than 300 videos.
# 1. https://github.com/ytdl-org/youtube-dl/issues/13830
# 2. http://channels.vlive.tv/EDBF.
'maxNumOfRows': 100,
- '_': int(time.time()),
'pageNo': page_num
}
)
if not channel_name:
channel_name = try_get(
video_list,
- lambda x: x['result']['channelInfo']['channelName'],
+ lambda x: x['channelInfo']['channelName'],
compat_str)
videos = try_get(
- video_list, lambda x: x['result']['videoList'], list)
+ video_list, lambda x: x['videoList'], list)
if not videos:
break
entries, channel_code, channel_name)
-class VLivePlaylistIE(InfoExtractor):
+# old extractor. Rewrite?
+
+class VLivePlaylistIE(VLiveBaseIE):
IE_NAME = 'vlive:playlist'
_VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)'
_VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s'
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
int_or_none,
js_to_json,
orderedSet,
'title': 'strange erotica',
'description': 'contains:an ET kind of thing',
'uploader': 'greenshowers',
- 'duration': 449,
+ 'duration': 450,
'view_count': int,
'comment_count': int,
'age_limit': 18,
title, thumbnail, duration = [None] * 3
- json_config_string = self._search_regex(
- r'playerConf=({.+?}),loaderConf',
- webpage, 'config', default=None)
- if not json_config_string:
- raise ExtractorError("Could not extract video player data")
-
- json_config_string = json_config_string.replace("!0", "true").replace("!1", "false")
-
- config = self._parse_json(json_config_string, video_id, transform_source=js_to_json, fatal=False)
- if not config:
- raise ExtractorError("Could not extract video player data")
-
- config = config.get('mainRoll')
- if isinstance(config, dict):
- title = config.get('title')
- thumbnail = config.get('poster')
- duration = int_or_none(config.get('duration'))
- sources = config.get('sources') or config.get('format')
+ config = self._parse_json(self._search_regex(
+ r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config',
+ default='{}'), video_id, transform_source=js_to_json, fatal=False)
+ if config:
+ config = config.get('mainRoll')
+ if isinstance(config, dict):
+ title = config.get('title')
+ thumbnail = config.get('poster')
+ duration = int_or_none(config.get('duration'))
+ sources = config.get('sources') or config.get('format')
if not isinstance(sources, dict):
sources = self._parse_json(self._search_regex(
'upload_date': '20101217',
'average_rating': int,
'view_count': int,
- 'comment_count': int,
'categories': list,
'tags': list,
'age_limit': 18,
'upload_date': '20110418',
'average_rating': int,
'view_count': int,
- 'comment_count': int,
'categories': list,
'tags': list,
'age_limit': 18,
r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>',
webpage, 'uploader', fatal=False)
upload_date = unified_strdate(self._html_search_regex(
- [r'Date\s+[Aa]dded:\s*<span>([^<]+)',
+ [r'UPLOADED:\s*<span>([^<]+)',
+ r'Date\s+[Aa]dded:\s*<span>([^<]+)',
r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'],
webpage, 'upload date', fatal=False))
webpage, 'view count', fatal=False, group='count'))
comment_count = str_to_int(self._search_regex(
r'>All [Cc]omments? \(([\d,.]+)\)',
- webpage, 'comment count', fatal=False))
+ webpage, 'comment count', default=None))
def extract_tag_box(regex, title):
tag_box = self._search_regex(regex, webpage, title, default=None)
from ..swfinterp import SWFInterpreter
from ..compat import (
compat_chr,
- compat_HTTPError,
compat_kwargs,
compat_parse_qs,
compat_urllib_parse_unquote,
bool_or_none,
clean_html,
error_to_compat_str,
- extract_attributes,
ExtractorError,
float_or_none,
- get_element_by_attribute,
get_element_by_id,
int_or_none,
- js_to_json,
mimetype2ext,
orderedSet,
parse_codecs,
unescapeHTML,
unified_strdate,
unsmuggle_url,
+ update_url_query,
uppercase_escape,
url_or_none,
urlencode_postdata,
+ urljoin,
)
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
- _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'
- _INITIAL_DATA_RE = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
- _YTCFG_DATA_RE = r"ytcfg.set\(({.*?})\)"
+ _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM)'
_YOUTUBE_CLIENT_HEADERS = {
'x-youtube-client-name': '1',
if not self._login():
return
+ _DEFAULT_API_DATA = {
+ 'context': {
+ 'client': {
+ 'clientName': 'WEB',
+ 'clientVersion': '2.20201021.03.00',
+ }
+ },
+ }
-class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
-
- def _find_entries_in_json(self, extracted):
- entries = []
- c = {}
-
- def _real_find(obj):
- if obj is None or isinstance(obj, str):
- return
-
- if type(obj) is list:
- for elem in obj:
- _real_find(elem)
-
- if type(obj) is dict:
- if self._is_entry(obj):
- entries.append(obj)
- return
-
- if 'continuationCommand' in obj:
- c['continuation'] = obj
- return
-
- for _, o in obj.items():
- _real_find(o)
-
- _real_find(extracted)
-
- return entries, try_get(c, lambda x: x["continuation"])
-
- def _entries(self, page, playlist_id, max_pages=None):
- seen = []
-
- yt_conf = {}
- for m in re.finditer(self._YTCFG_DATA_RE, page):
- parsed = self._parse_json(m.group(1), playlist_id,
- transform_source=js_to_json, fatal=False)
- if parsed:
- yt_conf.update(parsed)
-
- data_json = self._parse_json(self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'), None)
-
- for page_num in range(1, max_pages + 1) if max_pages is not None else itertools.count(1):
- entries, continuation = self._find_entries_in_json(data_json)
- processed = self._process_entries(entries, seen)
-
- if not processed:
- break
- for entry in processed:
- yield entry
-
- if not continuation or not yt_conf:
- break
- continuation_token = try_get(continuation, lambda x: x['continuationCommand']['token'])
- continuation_url = try_get(continuation, lambda x: x['commandMetadata']['webCommandMetadata']['apiUrl'])
- if not continuation_token or not continuation_url:
- break
-
- count = 0
- retries = 3
- while count <= retries:
- try:
- # Downloading page may result in intermittent 5xx HTTP error
- # that is usually worked around with a retry
- data_json = self._download_json(
- 'https://www.youtube.com%s' % continuation_url,
- playlist_id,
- 'Downloading continuation page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''),
-
- transform_source=uppercase_escape,
- query={
- 'key': try_get(yt_conf, lambda x: x['INNERTUBE_API_KEY'])
- },
- data=str(json.dumps({
- 'context': try_get(yt_conf, lambda x: x['INNERTUBE_CONTEXT']),
- 'continuation': continuation_token
- })).encode(encoding='UTF-8', errors='strict'),
- headers={
- 'Content-Type': 'application/json'
- }
- )
- break
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
- count += 1
- if count <= retries:
- continue
- raise
-
- def _extract_title(self, renderer):
- title = try_get(renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
- if title:
- return title
- return try_get(renderer, lambda x: x['title']['simpleText'], compat_str)
-
-
-class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
- def _is_entry(self, obj):
- return 'videoId' in obj
-
- def _process_entries(self, entries, seen):
- ids_in_page = []
- titles_in_page = []
- for renderer in entries:
- video_id = try_get(renderer, lambda x: x['videoId'])
- video_title = self._extract_title(renderer)
-
- if video_id is None or video_title is None:
- # we do not have a videoRenderer or title extraction broke
- continue
-
- video_title = video_title.strip()
-
- try:
- idx = ids_in_page.index(video_id)
- if video_title and not titles_in_page[idx]:
- titles_in_page[idx] = video_title
- except ValueError:
- ids_in_page.append(video_id)
- titles_in_page.append(video_title)
-
- for video_id, video_title in zip(ids_in_page, titles_in_page):
- yield self.url_result(video_id, 'Youtube', video_id, video_title)
-
-
-class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
- def _is_entry(self, obj):
- return 'playlistId' in obj
+ def _call_api(self, ep, query, video_id):
+ data = self._DEFAULT_API_DATA.copy()
+ data.update(query)
- def _process_entries(self, entries, seen):
- for playlist_id in orderedSet(try_get(r, lambda x: x['playlistId']) for r in entries):
+ response = self._download_json(
+ 'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
+ note='Downloading API JSON', errnote='Unable to download API page',
+ data=json.dumps(data).encode('utf8'),
+ headers={'content-type': 'application/json'},
+ query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})
- yield self.url_result(
- 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
+ return response
- def _real_extract(self, url):
- playlist_id = self._match_id(url)
- webpage = self._download_webpage(url, playlist_id)
- title = self._og_search_title(webpage, fatal=False)
- return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
+ def _extract_yt_initial_data(self, video_id, webpage):
+ return self._parse_json(
+ self._search_regex(
+ r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;',
+ webpage, 'yt initial data'),
+ video_id)
class YoutubeIE(YoutubeBaseInfoExtractor):
|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
)
)? # all until now is optional -> you can pass the naked ID
- ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
+ (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
(?!.*?\blist=
(?:
%(playlist_id)s| # combined list/video URLs are handled by the playlist IE
}
},
{
- 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
+ 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
'note': 'Use the first video ID in the URL',
'info_dict': {
'id': 'BaW_jenozKc',
},
'skip': 'format 141 not served anymore',
},
+ # DASH manifest with encrypted signature
+ {
+ 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
+ 'info_dict': {
+ 'id': 'IB3lcPjvWLA',
+ 'ext': 'm4a',
+ 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
+ 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
+ 'duration': 244,
+ 'uploader': 'AfrojackVEVO',
+ 'uploader_id': 'AfrojackVEVO',
+ 'upload_date': '20131011',
+ },
+ 'params': {
+ 'youtube_include_dash_manifest': True,
+ 'format': '141/bestaudio[ext=m4a]',
+ },
+ },
# Controversy video
{
'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
'age_limit': 18,
},
},
+ # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
+ # YouTube Red ad is not captured for creator
+ {
+ 'url': '__2ABJjxzNo',
+ 'info_dict': {
+ 'id': '__2ABJjxzNo',
+ 'ext': 'mp4',
+ 'duration': 266,
+ 'upload_date': '20100430',
+ 'uploader_id': 'deadmau5',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
+ 'creator': 'Dada Life, deadmau5',
+ 'description': 'md5:12c56784b8032162bb936a5f76d55360',
+ 'uploader': 'deadmau5',
+ 'title': 'Deadmau5 - Some Chords (HD)',
+ 'alt_title': 'This Machine Kills Some Chords',
+ },
+ 'expected_warnings': [
+ 'DASH manifest missing',
+ ]
+ },
# Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
{
'url': 'lqQg6PlCWgI',
'url': 'sJL6WA-aGkQ',
'only_matching': True,
},
- {
- 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
- 'only_matching': True,
- },
{
'url': 'https://invidio.us/watch?v=BaW_jenozKc',
'only_matching': True,
'skip_download': True,
},
},
- {
- # Youtube Music Auto-generated description
- # Retrieve 'artist' field from 'Artist:' in video description
- # when it is present on youtube music video
- 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
- 'info_dict': {
- 'id': 'k0jLE7tTwjY',
- 'ext': 'mp4',
- 'title': 'Latch Feat. Sam Smith',
- 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
- 'upload_date': '20150110',
- 'uploader': 'Various Artists - Topic',
- 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
- 'artist': 'Disclosure',
- 'track': 'Latch Feat. Sam Smith',
- 'album': 'Latch Featuring Sam Smith',
- 'release_date': '20121008',
- 'release_year': 2012,
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- # Youtube Music Auto-generated description
- # handle multiple artists on youtube music video
- 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
- 'info_dict': {
- 'id': '74qn0eJSjpA',
- 'ext': 'mp4',
- 'title': 'Eastside',
- 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
- 'upload_date': '20180710',
- 'uploader': 'Benny Blanco - Topic',
- 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
- 'artist': 'benny blanco, Halsey, Khalid',
- 'track': 'Eastside',
- 'album': 'Eastside',
- 'release_date': '20180713',
- 'release_year': 2018,
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- # Youtube Music Auto-generated description
- # handle youtube music video with release_year and no release_date
- 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
- 'info_dict': {
- 'id': '-hcAI0g-f5M',
- 'ext': 'mp4',
- 'title': 'Put It On Me',
- 'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
- 'upload_date': '20180426',
- 'uploader': 'Matt Maeson - Topic',
- 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
- 'artist': 'Matt Maeson',
- 'track': 'Put It On Me',
- 'album': 'The Hearse',
- 'release_date': None,
- 'release_year': 2018,
- },
- 'params': {
- 'skip_download': True,
- },
- },
{
'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
'only_matching': True,
# https://github.com/ytdl-org/youtube-dl/pull/7599)
r';ytplayer\.config\s*=\s*({.+?});ytplayer',
r';ytplayer\.config\s*=\s*({.+?});',
- r'ytInitialPlayerResponse\s*=\s*({.+?});var meta'
+ r'ytInitialPlayerResponse\s*=\s*({.+?});var meta' # Needed???
)
config = self._search_regex(
patterns, webpage, 'ytplayer.config', default=None)
self._downloader.report_warning(err_msg)
return {}
try:
- if "args" in player_config and "ttsurl" in player_config["args"]:
- args = player_config['args']
- caption_url = args['ttsurl']
+ args = player_config['args']
+ caption_url = args.get('ttsurl')
+ if caption_url:
timestamp = args['timestamp']
-
# We get the available subtitles
list_params = compat_urllib_parse_urlencode({
'type': 'list',
return captions
# New captions format as of 22.06.2017
- if "args" in player_config:
- player_response = player_config["args"].get('player_response')
- else:
- # New player system (ytInitialPlayerResponse) as of October 2020
- player_response = player_config
-
- if player_response:
- if isinstance(player_response, compat_str):
- player_response = self._parse_json(
- player_response, video_id, fatal=False)
-
- renderer = player_response['captions']['playerCaptionsTracklistRenderer']
- caption_tracks = renderer['captionTracks']
- for caption_track in caption_tracks:
- if 'kind' not in caption_track:
- # not an automatic transcription
- continue
- base_url = caption_track['baseUrl']
+ player_response = args.get('player_response')
+ if player_response and isinstance(player_response, compat_str):
+ player_response = self._parse_json(
+ player_response, video_id, fatal=False)
+ if player_response:
+ renderer = player_response['captions']['playerCaptionsTracklistRenderer']
+ base_url = renderer['captionTracks'][0]['baseUrl']
sub_lang_list = []
for lang in renderer['translationLanguages']:
lang_code = lang.get('languageCode')
sub_lang_list.append(lang_code)
return make_captions(base_url, sub_lang_list)
- self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id)
- return {}
-
- if "args" in player_config:
- args = player_config["args"]
-
- # Some videos don't provide ttsurl but rather caption_tracks and
- # caption_translation_languages (e.g. 20LmZk1hakA)
- # Does not used anymore as of 22.06.2017
- caption_tracks = args['caption_tracks']
- caption_translation_languages = args['caption_translation_languages']
- caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
- sub_lang_list = []
- for lang in caption_translation_languages.split(','):
- lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
- sub_lang = lang_qs.get('lc', [None])[0]
- if sub_lang:
- sub_lang_list.append(sub_lang)
- return make_captions(caption_url, sub_lang_list)
+ # Some videos don't provide ttsurl but rather caption_tracks and
+ # caption_translation_languages (e.g. 20LmZk1hakA)
+ # Does not used anymore as of 22.06.2017
+ caption_tracks = args['caption_tracks']
+ caption_translation_languages = args['caption_translation_languages']
+ caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0]
+ sub_lang_list = []
+ for lang in caption_translation_languages.split(','):
+ lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang))
+ sub_lang = lang_qs.get('lc', [None])[0]
+ if sub_lang:
+ sub_lang_list.append(sub_lang)
+ return make_captions(caption_url, sub_lang_list)
# An extractor error can be raise by the download process if there are
# no automatic captions but there are subtitles
except (KeyError, IndexError, ExtractorError):
def _extract_chapters_from_json(self, webpage, video_id, duration):
if not webpage:
return
- initial_data = self._parse_json(
- self._search_regex(
- r'window\["ytInitialData"\] = (.+);\n', webpage,
- 'player args', default='{}'),
- video_id, fatal=False)
- if not initial_data or not isinstance(initial_data, dict):
+ data = self._extract_yt_initial_data(video_id, webpage)
+ if not data or not isinstance(data, dict):
return
chapters_list = try_get(
- initial_data,
+ data,
lambda x: x['playerOverlays']
['playerOverlayRenderer']
['decoratedPlayerBarRenderer']
age_gate = False
# Try looking directly into the video webpage
ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
- args = ytplayer_config.get("args")
- if args is not None:
+ if ytplayer_config:
+ args = ytplayer_config.get('args', {})
if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
# Convert to the same format returned by compat_parse_qs
video_info = dict((k, [v]) for k, v in args.items())
is_live = True
if not player_response:
player_response = extract_player_response(args.get('player_response'), video_id)
- elif not player_response:
- player_response = ytplayer_config
if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
add_dash_mpd_pr(player_response)
+ if not video_info and not player_response:
+ player_response = extract_player_response(
+ self._search_regex(
+ r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;', video_webpage,
+ 'initial player response', default='{}'),
+ video_id)
+
def extract_unavailable_message():
messages = []
for tag, kind in (('h1', 'message'), ('div', 'submessage')):
if cipher:
if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
- ASSETS_RE = r'(?:"assets":.+?"js":\s*("[^"]+"))|(?:"jsUrl":\s*("[^"]+"))'
+ ASSETS_RE = (
+ r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base',
+ r'"jsUrl"\s*:\s*("[^"]+")',
+ r'"assets":.+?"js":\s*("[^"]+")')
jsplayer_url_json = self._search_regex(
ASSETS_RE,
embed_webpage if age_gate else video_webpage,
def _extract_count(count_name):
return str_to_int(self._search_regex(
- r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}'
+ r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
% re.escape(count_name),
video_webpage, count_name, default=None))
}
-class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
- IE_DESC = 'YouTube.com playlists'
- _VALID_URL = r"""(?x)(?:
- (?:https?://)?
- (?:\w+\.)?
- (?:
- (?:
- youtube(?:kids)?\.com|
- invidio\.us
- )
- /
- (?:
- (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
- \? (?:.*?[&;])*? (?:p|a|list)=
- | p/
- )|
- youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
- )
- (
- (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
- # Top tracks, they can also include dots
- |(?:MC)[\w\.]*
- )
- .*
- |
- (%(playlist_id)s)
- )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
- _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
- _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
- _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
- IE_NAME = 'youtube:playlist'
- _YTM_PLAYLIST_PREFIX = 'RDCLAK5uy_'
- _YTM_CHANNEL_INFO = {
- 'uploader': 'Youtube Music',
- 'uploader_id': 'music', # or "UC-9-kyTW8ZkZNDHQJ6FgpwQ"
- 'uploader_url': 'https://www.youtube.com/music'
- }
+class YoutubeTabIE(YoutubeBaseInfoExtractor):
+ IE_DESC = 'YouTube.com tab'
+ _VALID_URL = r'https?://(?:\w+\.)?(?:youtube(?:kids)?\.com|invidio\.us)/(?:(?:channel|c|user)/|(?:playlist|watch)\?.*?\blist=)(?P<id>[^/?#&]+)'
+ IE_NAME = 'youtube:tab'
+
_TESTS = [{
+ # playlists, multipage
+ 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
+ 'playlist_mincount': 94,
+ 'info_dict': {
+ 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
+ 'title': 'Игорь Клейнер - Playlists',
+ 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
+ },
+ }, {
+ # playlists, multipage, different order
+ 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
+ 'playlist_mincount': 94,
+ 'info_dict': {
+ 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
+ 'title': 'Игорь Клейнер - Playlists',
+ 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
+ },
+ }, {
+ # playlists, singlepage
+ 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
+ 'playlist_mincount': 4,
+ 'info_dict': {
+ 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
+ 'title': 'ThirstForScience - Playlists',
+ 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
+ }
+ }, {
+ 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
+ 'only_matching': True,
+ }, {
+ # basic, single video playlist
'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
'info_dict': {
'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
},
'playlist_count': 1,
}, {
+ # empty playlist
'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
'info_dict': {
'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
},
'playlist_count': 0,
}, {
- 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
- 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
+ # Home tab
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
'info_dict': {
- 'title': '29C3: Not my department',
- 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
- 'uploader': 'Christiaan008',
- 'uploader_id': 'ChRiStIaAn008',
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'lex will - Home',
+ 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
},
- 'playlist_count': 96,
+ 'playlist_mincount': 2,
}, {
- 'note': 'issue #673',
- 'url': 'PLBB231211A4F62143',
+ # Videos tab
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
'info_dict': {
- 'title': '[OLD]Team Fortress 2 (Class-based LP)',
- 'id': 'PLBB231211A4F62143',
- 'uploader': 'Wickydoo',
- 'uploader_id': 'Wickydoo',
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'lex will - Videos',
+ 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
},
- 'playlist_mincount': 26,
+ 'playlist_mincount': 975,
}, {
- 'note': 'Large playlist',
- 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
+ # Videos tab, sorted by popular
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
'info_dict': {
- 'title': 'Uploads from Cauchemar',
- 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
- 'uploader': 'Cauchemar',
- 'uploader_id': 'Cauchemar89',
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'lex will - Videos',
+ 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
},
- 'playlist_mincount': 799,
+ 'playlist_mincount': 199,
}, {
- 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
+ # Playlists tab
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
'info_dict': {
- 'title': 'YDL_safe_search',
- 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'lex will - Playlists',
+ 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
},
- 'playlist_count': 2,
- 'skip': 'This playlist is private',
+ 'playlist_mincount': 17,
}, {
- 'note': 'embedded',
- 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
- 'playlist_count': 4,
+ # Community tab
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
'info_dict': {
- 'title': 'JODA15',
- 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
- 'uploader': 'milan',
- 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
- }
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'lex will - Community',
+ 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
+ },
+ 'playlist_mincount': 18,
}, {
- 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
- 'playlist_mincount': 485,
+ # Channels tab
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
'info_dict': {
- 'title': '2018 Chinese New Singles (11/6 updated)',
- 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
- 'uploader': 'LBK',
- 'uploader_id': 'sdragonfang',
- }
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'lex will - Channels',
+ 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
+ },
+ 'playlist_mincount': 138,
}, {
- 'note': 'Embedded SWF player',
- 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
- 'playlist_count': 4,
+ 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://music.youtube.com/channel/UCT-K0qO8z6NzWrywqefBPBQ',
+ 'only_matching': True,
+ }, {
+ 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
+ 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
+ 'info_dict': {
+ 'title': '29C3: Not my department',
+ 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
+ 'uploader': 'Christiaan008',
+ 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
+ },
+ 'playlist_count': 96,
+ }, {
+ 'note': 'Large playlist',
+ 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
'info_dict': {
- 'title': 'JODA7',
- 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
+ 'title': 'Uploads from Cauchemar',
+ 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
+ 'uploader': 'Cauchemar',
+ 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
},
- 'skip': 'This playlist does not exist',
+ 'playlist_mincount': 1123,
+ }, {
+ # even larger playlist, 8832 videos
+ 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
+ 'only_matching': True,
}, {
'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
'title': 'Uploads from Interstellar Movie',
'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
'uploader': 'Interstellar Movie',
- 'uploader_id': 'InterstellarMovie1',
+ 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
},
'playlist_mincount': 21,
+ }, {
+ # https://github.com/ytdl-org/youtube-dl/issues/21844
+ 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
+ 'info_dict': {
+ 'title': 'Data Analysis with Dr Mike Pound',
+ 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
+ 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
+ 'uploader': 'Computerphile',
+ },
+ 'playlist_mincount': 11,
+ }, {
+ 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
+ 'only_matching': True,
}, {
# Playlist URL that does not actually serve a playlist
'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
'skip': 'This video is not available.',
'add_ie': [YoutubeIE.ie_key()],
}, {
- 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
- 'info_dict': {
- 'id': 'yeWKywCrFtk',
- 'ext': 'mp4',
- 'title': 'Small Scale Baler and Braiding Rugs',
- 'uploader': 'Backus-Page House Museum',
- 'uploader_id': 'backuspagemuseum',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
- 'upload_date': '20161008',
- 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
- 'categories': ['Nonprofits & Activism'],
- 'tags': list,
- 'like_count': int,
- 'dislike_count': int,
- },
- 'params': {
- 'noplaylist': True,
- 'skip_download': True,
- },
- }, {
- # https://github.com/ytdl-org/youtube-dl/issues/21844
- 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
- 'info_dict': {
- 'title': 'Data Analysis with Dr Mike Pound',
- 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
- 'uploader_id': 'Computerphile',
- 'uploader': 'Computerphile',
- },
- 'playlist_mincount': 11,
- }, {
- 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
- 'only_matching': True,
- }, {
- 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
- 'only_matching': True,
- }, {
- # music album playlist
- 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
- 'only_matching': True,
- }, {
- 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
+ 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
'only_matching': True,
}, {
- 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
+ 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
'only_matching': True,
}]
- def _real_initialize(self):
- self._login()
-
- def extract_videos_from_page(self, page):
- ids_in_page = []
- titles_in_page = []
-
- for item in re.findall(
- r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
- attrs = extract_attributes(item)
- video_id = attrs['data-video-id']
- video_title = unescapeHTML(attrs.get('data-title'))
- if video_title:
- video_title = video_title.strip()
- ids_in_page.append(video_id)
- titles_in_page.append(video_title)
-
- # Fallback with old _VIDEO_RE
- self.extract_videos_from_page_impl(
- self._VIDEO_RE, page, ids_in_page, titles_in_page)
-
- # Relaxed fallbacks
- self.extract_videos_from_page_impl(
- r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
- ids_in_page, titles_in_page)
- self.extract_videos_from_page_impl(
- r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
- ids_in_page, titles_in_page)
-
- return zip(ids_in_page, titles_in_page)
-
- def _extract_mix_ids_from_yt_initial(self, yt_initial):
- ids = []
- playlist_contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents'], list)
- if playlist_contents:
- for item in playlist_contents:
- videoId = try_get(item, lambda x: x['playlistPanelVideoRenderer']['videoId'], compat_str)
- if videoId:
- ids.append(videoId)
- return ids
-
- def _extract_mix(self, playlist_id):
- # The mixes are generated from a single video
- # the id of the playlist is just 'RD' + video_id
- ids = []
- yt_initial = None
- last_id = playlist_id[-11:]
- for n in itertools.count(1):
- url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
- webpage = self._download_webpage(
- url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
- new_ids = orderedSet(re.findall(
- r'''(?xs)data-video-username=".*?".*?
- href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id),
- webpage))
-
- # if no ids in html of page, try using embedded json
- if (len(new_ids) == 0):
- yt_initial = self._get_yt_initial_data(playlist_id, webpage)
- if yt_initial:
- new_ids = self._extract_mix_ids_from_yt_initial(yt_initial)
-
- # Fetch new pages until all the videos are repeated, it seems that
- # there are always 51 unique videos.
- new_ids = [_id for _id in new_ids if _id not in ids]
- if not new_ids:
- break
- ids.extend(new_ids)
- last_id = ids[-1]
-
- url_results = self._ids_to_results(ids)
-
- search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
- title_span = (
- search_title('playlist-title')
- or search_title('title long-title')
- or search_title('title'))
- title = clean_html(title_span)
-
- if not title:
- title = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['title'], compat_str)
+    @classmethod
+    def suitable(cls, url):
+        """Reject URLs that YoutubeLiveIE accepts; otherwise defer to the parent check."""
+        if YoutubeLiveIE.suitable(url):
+            return False
+        return super(YoutubeTabIE, cls).suitable(url)
+
+    def _extract_channel_id(self, webpage):
+        """Return the channel ID advertised by *webpage*.
+
+        The ``channelId`` meta tag is tried first.  When it is absent, the ID
+        is taken from the ``/channel/<id>`` path of the channel URL found in
+        one of the URL-carrying meta tags (Open Graph, app links, Twitter).
+        """
+        channel_id = self._html_search_meta(
+            'channelId', webpage, 'channel id', default=None)
+        if channel_id:
+            return channel_id
+        channel_url = self._html_search_meta(
+            ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
+             'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
+             'twitter:app:url:googleplay'), webpage, 'channel url')
+        # The quantifier has to live inside the capturing group: a repeated
+        # group ``(...)+`` only retains its last iteration, which would yield
+        # just the final character of the channel ID.
+        return self._search_regex(
+            r'https?://(?:www\.)?youtube\.com/channel/([^/?#&]+)',
+            channel_url, 'channel id')
- return self.playlist_result(url_results, playlist_id, title)
+    @staticmethod
+    def _extract_grid_item_renderer(item):
+        """Return the first non-empty grid renderer stored in *item*.
+
+        The playlist, video and channel renderer keys are probed in that
+        order; ``None`` is returned when none of them holds a truthy value.
+        """
+        candidates = (
+            item.get('grid%sRenderer' % item_kind)
+            for item_kind in ('Playlist', 'Video', 'Channel'))
+        # The generator is lazy, so probing stops at the first truthy renderer.
+        return next((found for found in candidates if found), None)
+
+    def _extract_video(self, renderer):
+        """Build a ``url_transparent`` result for YoutubeIE from a video renderer.
+
+        The title comes from ``title.runs[0].text`` or ``title.simpleText``;
+        the view count is the leading run of digits and commas in
+        ``viewCountText.simpleText`` once all whitespace has been removed.
+        Fields that cannot be read from *renderer* are left as ``None``.
+        """
+        def text_at(getter):
+            # Only string values are accepted; anything else becomes None.
+            return try_get(renderer, getter, compat_str)
+
+        video_id = renderer.get('videoId')
+        raw_view_count = text_at(lambda x: x['viewCountText']['simpleText']) or ''
+        leading_count = self._search_regex(
+            r'^([\d,]+)', re.sub(r'\s', '', raw_view_count),
+            'view count', default=None)
+        return {
+            '_type': 'url_transparent',
+            'ie_key': YoutubeIE.ie_key(),
+            'id': video_id,
+            'url': video_id,
+            'title': text_at((
+                lambda x: x['title']['runs'][0]['text'],
+                lambda x: x['title']['simpleText'])),
+            'description': text_at(
+                lambda x: x['descriptionSnippet']['runs'][0]['text']),
+            'duration': parse_duration(
+                text_at(lambda x: x['lengthText']['simpleText'])),
+            'view_count': str_to_int(leading_count),
+            'uploader': text_at(lambda x: x['ownerText']['runs'][0]['text']),
+        }
- def _extract_playlist(self, playlist_id):
- url = self._TEMPLATE_URL % playlist_id
- page = self._download_webpage(url, playlist_id)
-
- # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
- for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
- match = match.strip()
- # Check if the playlist exists or is private
- mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
- if mobj:
- reason = mobj.group('reason')
- message = 'This playlist %s' % reason
- if 'private' in reason:
- message += ', use --username or --netrc to access it'
- message += '.'
- raise ExtractorError(message, expected=True)
- elif re.match(r'[^<]*Invalid parameters[^<]*', match):
- raise ExtractorError(
- 'Invalid parameters. Maybe URL is incorrect.',
- expected=True)
- elif re.match(r'[^<]*Choose your language[^<]*', match):
+ def _grid_entries(self, grid_renderer):
+ for item in grid_renderer['items']:
+ if not isinstance(item, dict):
continue
- else:
- self.report_warning('Youtube gives an alert message: ' + match)
-
- playlist_title = self._html_search_regex(
- r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
- page, 'title', default=None)
+ renderer = self._extract_grid_item_renderer(item)
+ if not isinstance(renderer, dict):
+ continue
+ title = try_get(
+ renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
+ # playlist
+ playlist_id = renderer.get('playlistId')
+ if playlist_id:
+ yield self.url_result(
+ 'https://www.youtube.com/playlist?list=%s' % playlist_id,
+ ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
+ video_title=title)
+ # video
+ video_id = renderer.get('videoId')
+ if video_id:
+ yield self._extract_video(renderer)
+ # channel
+ channel_id = renderer.get('channelId')
+ if channel_id:
+ title = try_get(
+ renderer, lambda x: x['title']['simpleText'], compat_str)
+ yield self.url_result(
+ 'https://www.youtube.com/channel/%s' % channel_id,
+ ie=YoutubeTabIE.ie_key(), video_title=title)
+
+    def _shelf_entries_trimmed(self, shelf_renderer):
+        """Yield the entries embedded in a shelf's ``horizontalListRenderer``.
+
+        Nothing is yielded when the shelf carries no such renderer.
+        """
+        horizontal_list = try_get(
+            shelf_renderer, lambda x: x['content']['horizontalListRenderer'], dict)
+        if horizontal_list:
+            # TODO: add support for nested playlists so each shelf is processed
+            # as separate playlist
+            # TODO: this includes only first N items
+            for shelf_entry in self._grid_entries(horizontal_list):
+                yield shelf_entry
+
+    def _shelf_entries(self, shelf_renderer):
+        """Yield a single URL result pointing at the shelf's own page.
+
+        The target is the shelf endpoint's web URL joined onto
+        ``https://www.youtube.com``; nothing is yielded when that join
+        produces no URL.
+        """
+        endpoint_path = try_get(
+            shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
+            compat_str)
+        target_url = urljoin('https://www.youtube.com', endpoint_path)
+        if target_url:
+            shelf_title = try_get(
+                shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
+            yield self.url_result(target_url, video_title=shelf_title)
- _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
- uploader = self._html_search_regex(
- r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
- page, 'uploader', default=None)
- mobj = re.search(
- r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
- page)
- if mobj:
- uploader_id = mobj.group('uploader_id')
- uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
- else:
- uploader_id = uploader_url = None
+    def _playlist_entries(self, video_list_renderer):
+        """Yield a video result for every usable item in ``contents``.
+
+        An item is usable when it is a dict holding a ``playlistVideoRenderer``
+        or ``playlistPanelVideoRenderer`` dict with a non-empty ``videoId``.
+        """
+        for item in video_list_renderer['contents']:
+            if isinstance(item, dict):
+                video_renderer = (
+                    item.get('playlistVideoRenderer')
+                    or item.get('playlistPanelVideoRenderer'))
+                if isinstance(video_renderer, dict) and video_renderer.get('videoId'):
+                    yield self._extract_video(video_renderer)
- has_videos = True
+    def _video_entry(self, video_renderer):
+        """Return a video result for *video_renderer*, or None if it has no ``videoId``."""
+        if not video_renderer.get('videoId'):
+            return None
+        return self._extract_video(video_renderer)
- if not playlist_title:
- try:
- # Some playlist URLs don't actually serve a playlist (e.g.
- # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
- next(self._entries(page, playlist_id))
- except StopIteration:
- has_videos = False
+    def _post_thread_entries(self, post_thread_renderer):
+        """Yield entries for the videos referenced by a community post.
+
+        Two sources are inspected: the post's video attachment and the URL
+        endpoints of the text runs that YoutubeIE accepts.  A text link to
+        the same video as the attachment is skipped so it is not yielded
+        twice.
+        """
+        post_renderer = try_get(
+            post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
+        if not post_renderer:
+            return
+        # video attachment
+        video_renderer = try_get(
+            post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict)
+        video_id = None
+        if video_renderer:
+            entry = self._video_entry(video_renderer)
+            if entry:
+                # Remember the attached video so the duplicate check below
+                # can actually match it (it used to stay None forever).
+                video_id = entry.get('id')
+                yield entry
+        # inline video links
+        runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
+        for run in runs:
+            if not isinstance(run, dict):
+                continue
+            ep_url = try_get(
+                run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
+            if not ep_url:
+                continue
+            if not YoutubeIE.suitable(ep_url):
+                continue
+            ep_video_id = YoutubeIE._match_id(ep_url)
+            if video_id == ep_video_id:
+                continue
+            # Tag the result with the ID parsed from the link itself, not
+            # with the attachment's ID.
+            yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
- playlist = self.playlist_result(
- self._entries(page, playlist_id), playlist_id, playlist_title)
- playlist.update({
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'uploader_url': uploader_url,
- })
- if playlist_id.startswith(self._YTM_PLAYLIST_PREFIX):
- playlist.update(self._YTM_CHANNEL_INFO)
+    def _post_thread_continuation_entries(self, post_thread_continuation):
+        """Yield the entries of every post thread in a continuation response.
+
+        Nothing is yielded when ``contents`` is not a list; items that are
+        not dicts or carry no ``backstagePostThreadRenderer`` dict are skipped.
+        """
+        contents = post_thread_continuation.get('contents')
+        if not isinstance(contents, list):
+            return
+        for content in contents:
+            # Same guard as the sibling iterators: a non-dict item would
+            # otherwise raise AttributeError on ``.get``.
+            if not isinstance(content, dict):
+                continue
+            renderer = content.get('backstagePostThreadRenderer')
+            if not isinstance(renderer, dict):
+                continue
+            for entry in self._post_thread_entries(renderer):
+                yield entry
- return has_videos, playlist
+    @staticmethod
+    def _extract_next_continuation_data(renderer):
+        """Build continuation parameters from ``continuations[0].nextContinuationData``.
+
+        Returns a dict with ``ctoken``, ``continuation`` (both the continuation
+        token) and ``itct`` (the click tracking params, possibly ``None``), or
+        ``None`` when the data or its token is missing.
+        """
+        continuation_data = try_get(
+            renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict)
+        token = continuation_data.get('continuation') if continuation_data else None
+        if not token:
+            return None
+        return {
+            'ctoken': token,
+            'continuation': token,
+            'itct': continuation_data.get('clickTrackingParams'),
+        }
- def _check_download_just_video(self, url, playlist_id):
- # Check if it's a video-specific URL
- query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
- video_id = query_dict.get('v', [None])[0] or self._search_regex(
- r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
- 'video id', default=None)
- if video_id:
- if self._downloader.params.get('noplaylist'):
- self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
- return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
- else:
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
- return video_id, None
- return None, None
+    @classmethod
+    def _extract_continuation(cls, renderer):
+        """Return continuation parameters for *renderer*, or None if there are none.
+
+        The ``nextContinuationData`` form is preferred.  Otherwise the first
+        item of ``contents`` whose ``continuationItemRenderer`` endpoint has
+        both a command token and click tracking params is used.
+        """
+        direct = cls._extract_next_continuation_data(renderer)
+        if direct:
+            return direct
+        items = renderer.get('contents')
+        if not isinstance(items, list):
+            return None
+        for item in items:
+            if not isinstance(item, dict):
+                continue
+            endpoint = try_get(
+                item, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
+                dict)
+            token = try_get(
+                endpoint, lambda x: x['continuationCommand']['token'],
+                compat_str) if endpoint else None
+            # Tracking params are only consulted once a token was found.
+            click_params = endpoint.get('clickTrackingParams') if token else None
+            if click_params:
+                return {
+                    'ctoken': token,
+                    'continuation': token,
+                    'itct': click_params,
+                }
+        return None
- def _real_extract(self, url):
- # Extract playlist id
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError('Invalid URL: %s' % url)
- playlist_id = mobj.group(1) or mobj.group(2)
+ def _entries(self, tab, identity_token):
+ """Yield the entries of a tab, following continuation pages.
+
+ Entries are first taken from the renderers found under
+ tab['sectionListRenderer']['contents'].  While a continuation is
+ known, further pages are requested from browse_ajax (sending
+ identity_token as a header when given) and their entries yielded.
+ Pagination stops when no continuation is left or a response has no
+ usable shape.
+
+ The lines of this block keep the flat layout of the surrounding
+ patch because removed-side lines are interleaved with it.
+ """
+ continuation = None
+ slr_contents = try_get(tab, lambda x: x['sectionListRenderer']['contents'], list) or []
+ for slr_content in slr_contents:
+ if not isinstance(slr_content, dict):
+ continue
+ is_renderer = try_get(slr_content, lambda x: x['itemSectionRenderer'], dict)
+ if not is_renderer:
+ continue
+ isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
+ for isr_content in isr_contents:
+ if not isinstance(isr_content, dict):
+ continue
+ renderer = isr_content.get('playlistVideoListRenderer')
+ if renderer:
+ for entry in self._playlist_entries(renderer):
+ yield entry
+ continuation = self._extract_continuation(renderer)
+ continue
+ renderer = isr_content.get('gridRenderer')
+ if renderer:
+ for entry in self._grid_entries(renderer):
+ yield entry
+ continuation = self._extract_continuation(renderer)
+ continue
+ renderer = isr_content.get('shelfRenderer')
+ if renderer:
+ for entry in self._shelf_entries(renderer):
+ yield entry
+ continue
+ renderer = isr_content.get('backstagePostThreadRenderer')
+ if renderer:
+ for entry in self._post_thread_entries(renderer):
+ yield entry
+ continuation = self._extract_continuation(renderer)
+ continue
+ renderer = isr_content.get('videoRenderer')
+ if renderer:
+ entry = self._video_entry(renderer)
+ if entry:
+ yield entry
+
+ # Fall back to a continuation attached to the item section itself
+ if not continuation:
+ continuation = self._extract_continuation(is_renderer)
+
+ headers = {
+ 'x-youtube-client-name': '1',
+ 'x-youtube-client-version': '2.20201112.04.01',
+ }
+ if identity_token:
+ headers['x-youtube-identity-token'] = identity_token
- video_id, video = self._check_download_just_video(url, playlist_id)
- if video:
- return video
+ for page_num in itertools.count(1):
+ if not continuation:
+ break
+ browse = self._download_json(
+ 'https://www.youtube.com/browse_ajax', None,
+ 'Downloading page %d' % page_num,
+ headers=headers, query=continuation, fatal=False)
+ if not browse:
+ break
+ response = try_get(browse, lambda x: x[1]['response'], dict)
+ if not response:
+ break
- if playlist_id.startswith(('RD', 'UL', 'PU')):
- if not playlist_id.startswith(self._YTM_PLAYLIST_PREFIX):
- # Mixes require a custom extraction process,
- # Youtube Music playlists act like normal playlists (with randomized order)
- return self._extract_mix(playlist_id)
-
- has_videos, playlist = self._extract_playlist(playlist_id)
- if has_videos or not video_id:
- return playlist
-
- # Some playlist URLs don't actually serve a playlist (see
- # https://github.com/ytdl-org/youtube-dl/issues/10537).
- # Fallback to plain video extraction if there is a video id
- # along with playlist id.
- return self.url_result(video_id, 'Youtube', video_id=video_id)
-
-
-class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
- IE_DESC = 'YouTube.com channels'
- _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
- _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
- _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
- IE_NAME = 'youtube:channel'
- _TESTS = [{
- 'note': 'paginated channel',
- 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
- 'playlist_mincount': 91,
- 'info_dict': {
- 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
- 'title': 'Uploads from lex will',
- 'uploader': 'lex will',
- 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- }
- }, {
- 'note': 'Age restricted channel',
- # from https://www.youtube.com/user/DeusExOfficial
- 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
- 'playlist_mincount': 64,
- 'info_dict': {
- 'id': 'UUs0ifCMCm1icqRbqhUINa0w',
- 'title': 'Uploads from Deus Ex',
- 'uploader': 'Deus Ex',
- 'uploader_id': 'DeusExOfficial',
- },
- }, {
- 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
- 'only_matching': True,
- }]
+ continuation_contents = try_get(
+ response, lambda x: x['continuationContents'], dict)
+ if continuation_contents:
+ continuation_renderer = continuation_contents.get('playlistVideoListContinuation')
+ if continuation_renderer:
+ for entry in self._playlist_entries(continuation_renderer):
+ yield entry
+ continuation = self._extract_continuation(continuation_renderer)
+ continue
+ continuation_renderer = continuation_contents.get('gridContinuation')
+ if continuation_renderer:
+ for entry in self._grid_entries(continuation_renderer):
+ yield entry
+ continuation = self._extract_continuation(continuation_renderer)
+ continue
+ continuation_renderer = continuation_contents.get('itemSectionContinuation')
+ if continuation_renderer:
+ for entry in self._post_thread_continuation_entries(continuation_renderer):
+ yield entry
+ continuation = self._extract_continuation(continuation_renderer)
+ continue
- @classmethod
- def suitable(cls, url):
- return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
- else super(YoutubeChannelIE, cls).suitable(url))
+ continuation_items = try_get(
+ response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
+ if continuation_items:
+ continuation_item = continuation_items[0]
+ if not isinstance(continuation_item, dict):
+ # `continuation` is unchanged at this point, so looping again
+ # would request the very same page forever; stop paginating.
+ break
+ renderer = continuation_item.get('playlistVideoRenderer')
+ if renderer:
+ video_list_renderer = {'contents': continuation_items}
+ for entry in self._playlist_entries(video_list_renderer):
+ yield entry
+ continuation = self._extract_continuation(video_list_renderer)
+ continue
- def _build_template_url(self, url, channel_id):
- return self._TEMPLATE_URL % channel_id
+ break
- def _real_extract(self, url):
- channel_id = self._match_id(url)
-
- url = self._build_template_url(url, channel_id)
-
- # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
- # Workaround by extracting as a playlist if managed to obtain channel playlist URL
- # otherwise fallback on channel by page extraction
- channel_page = self._download_webpage(
- url + '?view=57', channel_id,
- 'Downloading channel page', fatal=False)
- if channel_page is False:
- channel_playlist_id = False
+ @staticmethod
+ def _extract_selected_tab(tabs):
+ """Return the 'tabRenderer' dict of the first tab flagged as selected.
+
+ Raises ExtractorError when no item of tabs has a true
+ tab['tabRenderer']['selected'] value.
+ """
+ for tab in tabs:
+ if try_get(tab, lambda x: x['tabRenderer']['selected'], bool):
+ return tab['tabRenderer']
else:
- channel_playlist_id = self._html_search_meta(
- 'channelId', channel_page, 'channel id', default=None)
- if not channel_playlist_id:
- channel_url = self._html_search_meta(
- ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
- channel_page, 'channel url', default=None)
- if channel_url:
- channel_playlist_id = self._search_regex(
- r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
- channel_url, 'channel id', default=None)
- if channel_playlist_id and channel_playlist_id.startswith('UC'):
- playlist_id = 'UU' + channel_playlist_id[2:]
- return self.url_result(
- compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
-
- channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
- autogenerated = re.search(r'''(?x)
- class="[^"]*?(?:
- channel-header-autogenerated-label|
- yt-channel-title-autogenerated
- )[^"]*"''', channel_page) is not None
-
- if autogenerated:
- # The videos are contained in a single page
- # the ajax pages can't be used, they are empty
- entries = [
- self.url_result(
- video_id, 'Youtube', video_id=video_id,
- video_title=video_title)
- for video_id, video_title in self.extract_videos_from_page(channel_page)]
- return self.playlist_result(entries, channel_id)
+ # for/else: only reached when the loop ends without returning a tab
+ raise ExtractorError('Unable to find selected tab')
- try:
- next(self._entries(channel_page, channel_id))
- except StopIteration:
- alert_message = self._html_search_regex(
- r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
- channel_page, 'alert', default=None, group='alert')
- if alert_message:
- raise ExtractorError('Youtube said: %s' % alert_message, expected=True)
+ @staticmethod
+ def _extract_uploader(data):
+     """Collect the playlist owner fields from the sidebar of data.
+
+     Looks through data['sidebar']['playlistSidebarRenderer']['items'] for
+     playlistSidebarSecondaryInfoRenderer items and reads the first title
+     run of their video owner.  Returns a dict that may hold 'uploader',
+     'uploader_id' and 'uploader_url'; it is empty when no owner is found.
+     """
+     info = {}
+     sidebar_items = try_get(
+         data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
+     for item in sidebar_items or []:
+         if not isinstance(item, dict):
+             continue
+         secondary_info = item.get('playlistSidebarSecondaryInfoRenderer')
+         if not isinstance(secondary_info, dict):
+             continue
+         owner = try_get(
+             secondary_info, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
+         if not owner:
+             continue
+         info['uploader'] = owner.get('text')
+         info['uploader_id'] = try_get(
+             owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
+         canonical_base_url = try_get(
+             owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)
+         info['uploader_url'] = urljoin(
+             'https://www.youtube.com/', canonical_base_url)
+     return info
+
+ def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
+     """Build a playlist result from the selected tab of a tabbed page.
+
+     Title, description and id come from data['metadata']: channel
+     metadata gives '<channel title> - <tab title>' and the channel's
+     externalId, while playlist metadata (applied last, so it wins)
+     gives the playlist title with item_id as id.  Uploader fields from
+     the sidebar are merged into the result.  webpage is accepted but
+     not used here.
+
+     Raises ExtractorError (from _extract_selected_tab) when no tab is
+     selected.
+     """
+     selected_tab = self._extract_selected_tab(tabs)
+     # Defaults keep these names bound when data carries neither kind of
+     # metadata renderer.
+     title = description = None
+     playlist_id = item_id
+     renderer = try_get(
+         data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
+     if renderer:
+         title = renderer.get('title') or item_id
+         tab_title = selected_tab.get('title')
+         if tab_title:
+             title += ' - %s' % tab_title
+         description = renderer.get('description')
+         playlist_id = renderer.get('externalId')
+     renderer = try_get(
+         data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
+     if renderer:
+         title = renderer.get('title')
+         description = None
+         playlist_id = item_id
+     playlist = self.playlist_result(
+         self._entries(selected_tab['content'], identity_token),
+         playlist_id=playlist_id, playlist_title=title,
+         playlist_description=description)
+     playlist.update(self._extract_uploader(data))
+     return playlist
- return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
+ def _extract_from_playlist(self, item_id, data, playlist):
+     """Build a playlist result from a watch-page playlist dict.
+
+     The title is playlist['title'], falling back to
+     data['titleText']['simpleText']; the id is playlist['playlistId'],
+     falling back to item_id.
+     """
+     playlist_title = playlist.get('title')
+     if not playlist_title:
+         playlist_title = try_get(
+             data, lambda x: x['titleText']['simpleText'], compat_str)
+     resolved_id = playlist.get('playlistId') or item_id
+     entries = self._playlist_entries(playlist)
+     return self.playlist_result(
+         entries, playlist_id=resolved_id, playlist_title=playlist_title)
+ def _real_extract(self, url):
+     """Extract a tab or playlist page, or defer to single-video extraction.
+
+     The URL is first rewritten to the www.youtube.com host.  When it
+     names both a video (v=) and a playlist (list=), the 'noplaylist'
+     option decides whether only the video is returned.  Otherwise the
+     page's initial data is checked for browse tabs, then for a
+     watch-page playlist; a video id alone is the last fallback.
+
+     Raises ExtractorError when none of these shapes is recognized.
+     """
+     item_id = self._match_id(url)
+     www_parts = compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')
+     url = compat_urlparse.urlunparse(www_parts)
+     # Handle both video/playlist URLs
+     query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+     video_id = query.get('v', [None])[0]
+     playlist_id = query.get('list', [None])[0]
+     if video_id and playlist_id:
+         if self._downloader.params.get('noplaylist'):
+             self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+             return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
+         self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+     webpage = self._download_webpage(url, item_id)
+     identity_token = self._search_regex(
+         r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
+         'identity token', default=None)
+     data = self._extract_yt_initial_data(item_id, webpage)
+     tabs = try_get(
+         data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
+     if tabs:
+         return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
+     playlist = try_get(
+         data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
+     if playlist:
+         return self._extract_from_playlist(item_id, data, playlist)
+     if not video_id:
+         # Failed to recognize
+         raise ExtractorError('Unable to recognize tab page')
+     # Fallback to video extraction if no playlist alike page is recognized
+     return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
-class YoutubeUserIE(YoutubeChannelIE):
- IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
- _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
- _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
- IE_NAME = 'youtube:user'
+class YoutubePlaylistIE(InfoExtractor):
+ # Matches playlist ids, either bare or behind a list= URL parameter, and
+ # hands the actual extraction over to YoutubeTabIE.
+ IE_DESC = 'YouTube.com playlists'
+ _VALID_URL = r'''(?x)(?:
+ (?:https?://)?
+ (?:\w+\.)?
+ (?:
+ (?:
+ youtube(?:kids)?\.com|
+ invidio\.us|
+ youtu\.be
+ )
+ /.*?\?.*?\blist=
+ )?
+ (?P<id>%(playlist_id)s)
+ )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
+ IE_NAME = 'youtube:playlist'
_TESTS = [{
- 'url': 'https://www.youtube.com/user/TheLinuxFoundation',
- 'playlist_mincount': 320,
+ 'note': 'issue #673',
+ 'url': 'PLBB231211A4F62143',
'info_dict': {
- 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
- 'title': 'Uploads from The Linux Foundation',
- 'uploader': 'The Linux Foundation',
- 'uploader_id': 'TheLinuxFoundation',
- }
+ 'title': '[OLD]Team Fortress 2 (Class-based LP)',
+ 'id': 'PLBB231211A4F62143',
+ 'uploader': 'Wickydoo',
+ 'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
+ },
+ 'playlist_mincount': 29,
+ }, {
+ 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
+ 'info_dict': {
+ 'title': 'YDL_safe_search',
+ 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
+ },
+ 'playlist_count': 2,
+ 'skip': 'This playlist is private',
}, {
- # Only available via https://www.youtube.com/c/12minuteathlete/videos
- # but not https://www.youtube.com/user/12minuteathlete/videos
- 'url': 'https://www.youtube.com/c/12minuteathlete/videos',
- 'playlist_mincount': 249,
+ 'note': 'embedded',
+ 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
+ 'playlist_count': 4,
'info_dict': {
- 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
- 'title': 'Uploads from 12 Minute Athlete',
- 'uploader': '12 Minute Athlete',
- 'uploader_id': 'the12minuteathlete',
+ 'title': 'JODA15',
+ 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
+ 'uploader': 'milan',
+ 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
}
}, {
- 'url': 'ytuser:phihag',
- 'only_matching': True,
+ 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
+ 'playlist_mincount': 982,
+ 'info_dict': {
+ 'title': '2018 Chinese New Singles (11/6 updated)',
+ 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
+ 'uploader': 'LBK',
+ 'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
+ }
}, {
- 'url': 'https://www.youtube.com/c/gametrailers',
- 'only_matching': True,
+ 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
+ 'info_dict': {
+ 'id': 'yeWKywCrFtk',
+ 'ext': 'mp4',
+ 'title': 'Small Scale Baler and Braiding Rugs',
+ 'uploader': 'Backus-Page House Museum',
+ 'uploader_id': 'backuspagemuseum',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
+ 'upload_date': '20161008',
+ 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
+ 'categories': ['Nonprofits & Activism'],
+ 'tags': list,
+ 'like_count': int,
+ 'dislike_count': int,
+ },
+ 'params': {
+ 'noplaylist': True,
+ 'skip_download': True,
+ },
}, {
- 'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
+ 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
'only_matching': True,
}, {
- 'url': 'https://www.youtube.com/gametrailers',
+ 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
'only_matching': True,
}, {
- # This channel is not available, geo restricted to JP
- 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
+ # music album playlist
+ 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
- # Don't return True if the url can be extracted with other youtube
- # extractor, the regex would is too permissive and it would match.
- other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls)
- if any(ie.suitable(url) for ie in other_yt_ies):
- return False
- else:
- return super(YoutubeUserIE, cls).suitable(url)
+ # Step aside for every URL that YoutubeTabIE itself accepts
+ return False if YoutubeTabIE.suitable(url) else super(
+ YoutubePlaylistIE, cls).suitable(url)
- def _build_template_url(self, url, channel_id):
- mobj = re.match(self._VALID_URL, url)
- return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id'))
+ def _real_extract(self, url):
+ """Redirect to the /playlist page and let YoutubeTabIE extract it.
+
+ The query string of url is carried over unchanged; when there is
+ none, list=<matched id> is used instead.
+ """
+ playlist_id = self._match_id(url)
+ qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ # An input without a query string still needs a list= parameter
+ if not qs:
+ qs = {'list': playlist_id}
+ return self.url_result(
+ update_url_query('https://www.youtube.com/playlist', qs),
+ ie=YoutubeTabIE.ie_key(), video_id=playlist_id)
+
+
+class YoutubeYtUserIE(InfoExtractor):
+    # Resolves the "ytuser:<name>" shorthand to the matching /user/ page.
+    _VALID_URL = r'ytuser:(?P<id>.+)'
+    _TESTS = [{
+        'url': 'ytuser:phihag',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        """Hand the /user/ page of the matched name over to YoutubeTabIE."""
+        user_id = self._match_id(url)
+        user_page_url = 'https://www.youtube.com/user/%s' % user_id
+        return self.url_result(
+            user_page_url, ie=YoutubeTabIE.ie_key(), video_id=user_id)
class YoutubeLiveIE(YoutubeBaseInfoExtractor):
return self.url_result(base_url)
-class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
- IE_DESC = 'YouTube.com user/channel playlists'
- _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'
- IE_NAME = 'youtube:playlists'
-
- _TESTS = [{
- 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
- 'playlist_mincount': 4,
- 'info_dict': {
- 'id': 'ThirstForScience',
- 'title': 'ThirstForScience',
- },
- }, {
- # with "Load more" button
- 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
- 'playlist_mincount': 70,
- 'info_dict': {
- 'id': 'igorkle1',
- 'title': 'Игорь Клейнер',
- },
- }, {
- 'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
- 'playlist_mincount': 17,
- 'info_dict': {
- 'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
- 'title': 'Chem Player',
- },
- 'skip': 'Blocked',
- }, {
- 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
- 'only_matching': True,
- }]
-
-
-class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistBaseInfoExtractor):
+class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
IE_DESC = 'YouTube.com searches'
# there doesn't appear to be a real limit, for example if you search for
# 'python' you get more than 8.000.000 results
_SEARCH_PARAMS = 'CAI%3D'
-class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor):
- IE_DESC = 'YouTube.com search URLs'
- IE_NAME = 'youtube:search_url'
- _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
- _TESTS = [{
- 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
- 'playlist_mincount': 5,
- 'info_dict': {
- 'title': 'youtube-dl test video',
- }
- }, {
- 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
- 'only_matching': True,
- }]
-
- def _process_json_dict(self, obj, videos, c):
- if "videoId" in obj:
- videos.append(obj)
- return
-
- if "nextContinuationData" in obj:
- c["continuation"] = obj["nextContinuationData"]
- return
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- query = compat_urllib_parse_unquote_plus(mobj.group('query'))
- webpage = self._download_webpage(url, query)
- return self.playlist_result(self._entries(webpage, query, max_pages=5), playlist_title=query)
-
-
-class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
- IE_DESC = 'YouTube.com (multi-season) shows'
- _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
- IE_NAME = 'youtube:show'
- _TESTS = [{
- 'url': 'https://www.youtube.com/show/airdisasters',
- 'playlist_mincount': 5,
- 'info_dict': {
- 'id': 'airdisasters',
- 'title': 'Air Disasters',
- }
- }]
-
- def _real_extract(self, url):
- playlist_id = self._match_id(url)
- return super(YoutubeShowIE, self)._real_extract(
- 'https://www.youtube.com/show/%s/playlists' % playlist_id)
-
-
-class YoutubeFeedsInfoExtractor(YoutubePlaylistBaseInfoExtractor):
+class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
"""
Base class for feed extractors
Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
def _real_initialize(self):
self._login()
- def _process_entries(self, entries, seen):
- new_info = []
- for v in entries:
- v_id = try_get(v, lambda x: x['videoId'])
- if not v_id:
- continue
+ def _entries(self, page):
+ """Yield results for the video ids linked from a feed page.
+
+ Ids are collected from /watch?v= links in page; further portions are
+ fetched through the "load more" link for as long as one is present
+ and the latest portion contributed ids not seen before.
+ """
+ # The extraction process is the same as for playlists, but the regex
+ # for the video ids doesn't contain an index
+ ids = []
+ more_widget_html = content_html = page
+ for page_num in itertools.count(1):
+ matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
- have_video = False
- for old in seen:
- if old['videoId'] == v_id:
- have_video = True
- break
+ # 'recommended' feed has infinite 'load more' and each new portion spins
+ # the same videos in (sometimes) slightly different order, so we'll check
+ # for uniqueness and break when a portion has no new videos
+ new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches)))
+ if not new_ids:
+ break
- if not have_video:
- new_info.append(v)
+ ids.extend(new_ids)
- if not new_info:
- return
+ for entry in self._ids_to_results(new_ids):
+ yield entry
- seen.extend(new_info)
- for video in new_info:
- yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=self._extract_title(video))
+ mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+ if not mobj:
+ break
+
+ # The JSON reply carries the next HTML portion and the next
+ # "load more" widget separately
+ more = self._download_json(
+ 'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
+ 'Downloading page #%s' % page_num,
+ transform_source=uppercase_escape,
+ headers=self._YOUTUBE_CLIENT_HEADERS)
+ content_html = more['content_html']
+ more_widget_html = more['load_more_widget_html']
def _real_extract(self, url):
page = self._download_webpage(
'https://www.youtube.com/feed/%s' % self._FEED_NAME,
self._PLAYLIST_TITLE)
- return self.playlist_result(self._entries(page, self._PLAYLIST_TITLE),
- playlist_title=self._PLAYLIST_TITLE)
+ return self.playlist_result(
+ self._entries(page), playlist_title=self._PLAYLIST_TITLE)
-class YoutubeWatchLaterIE(YoutubePlaylistIE):
+class YoutubeWatchLaterIE(InfoExtractor):
IE_NAME = 'youtube:watchlater'
IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
- _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater'
_TESTS = [{
- 'url': 'https://www.youtube.com/playlist?list=WL',
+ 'url': 'https://www.youtube.com/feed/watch_later',
'only_matching': True,
}, {
- 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
+ 'url': ':ytwatchlater',
'only_matching': True,
}]
def _real_extract(self, url):
+ return self.url_result(
+ 'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key())
_, video = self._check_download_just_video(url, 'WL')
if video:
return video
return playlist
-class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
- IE_NAME = 'youtube:favorites'
- IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
- _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
- _LOGIN_REQUIRED = True
-
- def _real_extract(self, url):
- webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
- playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id')
- return self.url_result(playlist_id, 'YoutubePlaylist')
-
-
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
_VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
raise ExtractorError(
'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
expected=True)
+
+
+# Old extractors. Are these cases handled elsewhere?
+
+class YoutubeSearchURLIE(YoutubeSearchIE):
+    # Handles youtube.com/results?... URLs carrying a search_query= or q=
+    # parameter.
+    IE_DESC = 'YouTube.com search URLs'
+    IE_NAME = 'youtube:search_url'
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
+    _TESTS = [{
+        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
+        'playlist_mincount': 5,
+        'info_dict': {
+            'title': 'youtube-dl test video',
+        }
+    }, {
+        'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB',
+        'only_matching': True,
+    }]
+
+    def _process_json_dict(self, obj, videos, c):
+        """Record obj as a video, or store its continuation data in c.
+
+        A dict with a "videoId" key is appended to videos; otherwise a
+        "nextContinuationData" value is saved as c["continuation"].
+        """
+        if "videoId" in obj:
+            videos.append(obj)
+        elif "nextContinuationData" in obj:
+            c["continuation"] = obj["nextContinuationData"]
+
+    def _real_extract(self, url):
+        """Return the results of the search URL as a playlist titled by its query."""
+        raw_query = re.match(self._VALID_URL, url).group('query')
+        query = compat_urllib_parse_unquote_plus(raw_query)
+        webpage = self._download_webpage(url, query)
+        entries = self._entries(webpage, query, max_pages=5)
+        return self.playlist_result(entries, playlist_title=query)
+
+
+class YoutubeShowIE(InfoExtractor):
+ # Rewrites /show/<id> URLs to the show's /playlists page and delegates
+ # extraction of that page to the base class.
+ # NOTE(review): this class used to derive from
+ # YoutubePlaylistsBaseInfoExtractor and now derives from InfoExtractor;
+ # confirm that the new base provides the _real_extract implementation
+ # that the super() call below relies on.
+ IE_DESC = 'YouTube.com (multi-season) shows'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
+ IE_NAME = 'youtube:show'
+ _TESTS = [{
+ 'url': 'https://www.youtube.com/show/airdisasters',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'id': 'airdisasters',
+ 'title': 'Air Disasters',
+ }
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+ return super(YoutubeShowIE, self)._real_extract(
+ 'https://www.youtube.com/show/%s/playlists' % playlist_id)
+
+
+class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
+    # Finds the favourites playlist id on the my_favorites page and passes
+    # it on to the playlist extractor.
+    IE_NAME = 'youtube:favorites'
+    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
+    _LOGIN_REQUIRED = True
+
+    def _real_extract(self, url):
+        """Return a url_result for the playlist id found on the favourites page."""
+        favourites_page = self._download_webpage(
+            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
+        playlist_id = self._search_regex(
+            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
+        return self.url_result(playlist_id, 'YoutubePlaylist')
v = m.group(0)
if v in ('true', 'false', 'null'):
return v
- elif v.startswith('/*') or v.startswith('//') or v == ',':
+ elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
return ""
if v[0] in ("'", '"'):
'\\\n': '',
'\\x': '\\u00',
}.get(m.group(0), m.group(0)), v[1:-1])
-
- for regex, base in INTEGER_TABLE:
- im = re.match(regex, v)
- if im:
- i = int(im.group(1), base)
- return '"%d":' % i if v.endswith(':') else '%d' % i
+ else:
+ for regex, base in INTEGER_TABLE:
+ im = re.match(regex, v)
+ if im:
+ i = int(im.group(1), base)
+ return '"%d":' % i if v.endswith(':') else '%d' % i
return '"%s"' % v
{comment}|,(?={skip}[\]}}])|
(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
\b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
- [0-9]+(?={skip}:)
+ [0-9]+(?={skip}:)|
+ !+
'''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)