- **Aparat**
- **AppleConnect**
- **AppleDaily**: 臺灣蘋果日報
+ - **ApplePodcasts**
- **appletrailers**
- **appletrailers:section**
- **archive.org**: archive.org videos
- **BellMedia**
- **Bet**
- **bfi:player**
+ - **bfmtv**
+ - **bfmtv:article**
+ - **bfmtv:live**
+ - **BibelTV**
- **Bigflix**
- **Bild**: Bild.de
- **BiliBili**
- **Go**
- **GodTube**
- **Golem**
+ - **google:podcasts**
+ - **google:podcasts:feed**
- **GoogleDrive**
- **Goshgay**
- **GPUTechConf**
- **HungamaSong**
- **Hypem**
- **ign.com**
+ - **IHeartRadio**
+ - **iheartradio:podcast**
- **imdb**: Internet Movie Database trailers
- **imdb:list**: Internet Movie Database lists
- **Imgur**
- **Playwire**
- **pluralsight**
- **pluralsight:course**
- - **plus.google**: Google Plus
- **podomatic**
- **Pokemon**
- **PokemonWatch**
- **WWE**
- **XBef**
- **XboxClips**
- - **XFileShare**: XFileShare based sites: ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, XVideoSharing
+ - **XFileShare**: XFileShare based sites: Aparat, ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, XVideoSharing
- **XHamster**
- **XHamsterEmbed**
- **XHamsterUser**
class TestRaiPlaySubtitles(BaseTestSubtitles):
- url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html'
IE = RaiPlayIE
- def test_allsubtitles(self):
+ def test_subtitles_key(self):
+ self.url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html'
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles()
self.assertEqual(set(subtitles.keys()), set(['it']))
self.assertEqual(md5(subtitles['it']), 'b1d90a98755126b61e667567a1f6680a')
+ def test_subtitles_array_key(self):
+ self.url = 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html'
+ self.DL.params['writesubtitles'] = True
+ self.DL.params['allsubtitles'] = True
+ subtitles = self.getSubtitles()
+ self.assertEqual(set(subtitles.keys()), set(['it']))
+ self.assertEqual(md5(subtitles['it']), '4b3264186fbb103508abe5311cfcb9cd')
+
class TestVikiSubtitles(BaseTestSubtitles):
url = 'http://www.viki.com/videos/1060846v-punch-episode-18'
encode_base_n,
caesar,
clean_html,
+ clean_podcast_url,
date_from_str,
DateRange,
detect_exe_version,
iri_to_uri('http://导航.中国/'),
'http://xn--fet810g.xn--fiqs8s/')
+ def test_clean_podcast_url(self):
+ self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3')
+ self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3')
+
if __name__ == '__main__':
unittest.main()
iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence)
decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen(
self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read()
- frag_content = AES.new(
- decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content)
+            # Don't decrypt the content in tests since the data is explicitly truncated and is not of a valid block
+            # size (see https://github.com/ytdl-org/youtube-dl/pull/27660). Tests only care that the correct data was
+            # downloaded, not what it decrypts to.
+ if not test:
+ frag_content = AES.new(
+ decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content)
self._append_fragment(ctx, frag_content)
# We only download the first fragment during the test
if test:
from .common import InfoExtractor
from ..utils import (
clean_html,
+ clean_podcast_url,
int_or_none,
parse_iso8601,
)
info = {
'id': episode['id'],
'display_id': episode.get('episodeUrl'),
- 'url': episode['url'],
+ 'url': clean_podcast_url(episode['url']),
'title': title,
'description': clean_html(episode.get('description') or episode.get('summary')),
'thumbnail': episode.get('image'),
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_podcast_url,
+ int_or_none,
+ parse_iso8601,
+ try_get,
+)
+
+
+class ApplePodcastsIE(InfoExtractor):
+ _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
+ 'md5': 'df02e6acb11c10e844946a39e7222b08',
+ 'info_dict': {
+ 'id': '1000482637777',
+ 'ext': 'mp3',
+ 'title': '207 - Whitney Webb Returns',
+ 'description': 'md5:13a73bade02d2e43737751e3987e1399',
+ 'upload_date': '20200705',
+ 'timestamp': 1593921600,
+ 'duration': 6425,
+ 'series': 'The Tim Dillon Show',
+ }
+ }, {
+ 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ episode_id = self._match_id(url)
+ webpage = self._download_webpage(url, episode_id)
+ ember_data = self._parse_json(self._search_regex(
+ r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<',
+ webpage, 'ember data'), episode_id)
+ episode = ember_data['data']['attributes']
+ description = episode.get('description') or {}
+
+ series = None
+ for inc in (ember_data.get('included') or []):
+ if inc.get('type') == 'media/podcast':
+ series = try_get(inc, lambda x: x['attributes']['name'])
+
+ return {
+ 'id': episode_id,
+ 'title': episode['name'],
+ 'url': clean_podcast_url(episode['assetUrl']),
+ 'description': description.get('standard') or description.get('short'),
+ 'timestamp': parse_iso8601(episode.get('releaseDateTime')),
+ 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000),
+ 'series': series,
+ }
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import extract_attributes
+
+
+class BFMTVBaseIE(InfoExtractor):
+ _VALID_URL_BASE = r'https?://(?:www\.)?bfmtv\.com/'
+ _VALID_URL_TMPL = _VALID_URL_BASE + r'(?:[^/]+/)*[^/?&#]+_%s[A-Z]-(?P<id>\d{12})\.html'
+ _VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block"[^>]*>)'
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
+
+ def _brightcove_url_result(self, video_id, video_block):
+ account_id = video_block.get('accountid') or '876450612001'
+ player_id = video_block.get('playerid') or 'I2qBTln4u'
+ return self.url_result(
+ self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id),
+ 'BrightcoveNew', video_id)
+
+
+class BFMTVIE(BFMTVBaseIE):
+ IE_NAME = 'bfmtv'
+ _VALID_URL = BFMTVBaseIE._VALID_URL_TMPL % 'V'
+ _TESTS = [{
+ 'url': 'https://www.bfmtv.com/politique/emmanuel-macron-l-islam-est-une-religion-qui-vit-une-crise-aujourd-hui-partout-dans-le-monde_VN-202010020146.html',
+ 'info_dict': {
+ 'id': '6196747868001',
+ 'ext': 'mp4',
+ 'title': 'Emmanuel Macron: "L\'Islam est une religion qui vit une crise aujourd’hui, partout dans le monde"',
+ 'description': 'Le Président s\'exprime sur la question du séparatisme depuis les Mureaux, dans les Yvelines.',
+ 'uploader_id': '876450610001',
+ 'upload_date': '20201002',
+ 'timestamp': 1601629620,
+ },
+ }]
+
+ def _real_extract(self, url):
+ bfmtv_id = self._match_id(url)
+ webpage = self._download_webpage(url, bfmtv_id)
+ video_block = extract_attributes(self._search_regex(
+ self._VIDEO_BLOCK_REGEX, webpage, 'video block'))
+ return self._brightcove_url_result(video_block['videoid'], video_block)
+
+
+class BFMTVLiveIE(BFMTVIE):
+ IE_NAME = 'bfmtv:live'
+ _VALID_URL = BFMTVBaseIE._VALID_URL_BASE + '(?P<id>(?:[^/]+/)?en-direct)'
+ _TESTS = [{
+ 'url': 'https://www.bfmtv.com/en-direct/',
+ 'info_dict': {
+ 'id': '5615950982001',
+ 'ext': 'mp4',
+ 'title': r're:^le direct BFMTV WEB \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+ 'uploader_id': '876450610001',
+ 'upload_date': '20171018',
+ 'timestamp': 1508329950,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.bfmtv.com/economie/en-direct/',
+ 'only_matching': True,
+ }]
+
+
+class BFMTVArticleIE(BFMTVBaseIE):
+ IE_NAME = 'bfmtv:article'
+ _VALID_URL = BFMTVBaseIE._VALID_URL_TMPL % 'A'
+ _TESTS = [{
+ 'url': 'https://www.bfmtv.com/sante/covid-19-un-responsable-de-l-institut-pasteur-se-demande-quand-la-france-va-se-reconfiner_AV-202101060198.html',
+ 'info_dict': {
+ 'id': '202101060198',
+ 'title': 'Covid-19: un responsable de l\'Institut Pasteur se demande "quand la France va se reconfiner"',
+ 'description': 'md5:947974089c303d3ac6196670ae262843',
+ },
+ 'playlist_count': 2,
+ }, {
+ 'url': 'https://www.bfmtv.com/international/pour-bolsonaro-le-bresil-est-en-faillite-mais-il-ne-peut-rien-faire_AD-202101060232.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.bfmtv.com/sante/covid-19-oui-le-vaccin-de-pfizer-distribue-en-france-a-bien-ete-teste-sur-des-personnes-agees_AN-202101060275.html',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ bfmtv_id = self._match_id(url)
+ webpage = self._download_webpage(url, bfmtv_id)
+
+ entries = []
+ for video_block_el in re.findall(self._VIDEO_BLOCK_REGEX, webpage):
+ video_block = extract_attributes(video_block_el)
+ video_id = video_block.get('videoid')
+ if not video_id:
+ continue
+ entries.append(self._brightcove_url_result(video_id, video_block))
+
+ return self.playlist_result(
+ entries, bfmtv_id, self._og_search_title(webpage, fatal=False),
+ self._html_search_meta(['og:description', 'description'], webpage))
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class BibelTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?:crn/)?(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.bibeltv.de/mediathek/videos/329703-sprachkurs-in-malaiisch',
+ 'md5': '252f908192d611de038b8504b08bf97f',
+ 'info_dict': {
+ 'id': 'ref:329703',
+ 'ext': 'mp4',
+ 'title': 'Sprachkurs in Malaiisch',
+ 'description': 'md5:3e9f197d29ee164714e67351cf737dfe',
+ 'timestamp': 1608316701,
+ 'uploader_id': '5840105145001',
+ 'upload_date': '20201218',
+ }
+ }, {
+ 'url': 'https://www.bibeltv.de/mediathek/videos/crn/326374',
+ 'only_matching': True,
+ }]
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5840105145001/default_default/index.html?videoId=ref:%s'
+
+ def _real_extract(self, url):
+ crn_id = self._match_id(url)
+ return self.url_result(
+ self.BRIGHTCOVE_URL_TEMPLATE % crn_id, 'BrightcoveNew')
from .gigya import GigyaBaseIE
from ..compat import compat_HTTPError
from ..utils import (
+ extract_attributes,
ExtractorError,
strip_or_none,
float_or_none,
int_or_none,
merge_dicts,
- parse_iso8601,
str_or_none,
url_or_none,
)
'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
'only_matching': True,
}]
+ _GEO_BYPASS = False
_HLS_ENTRY_PROTOCOLS_MAP = {
'HLS': 'm3u8_native',
'HLS_AES': 'm3u8',
mobj = re.match(self._VALID_URL, url)
site_id, video_id = mobj.group('site_id'), mobj.group('id')
- # Old API endpoint, serves more formats but may fail for some videos
- data = self._download_json(
- 'https://mediazone.vrt.be/api/v1/%s/assets/%s'
- % (site_id, video_id), video_id, 'Downloading asset JSON',
- 'Unable to download asset JSON', fatal=False)
+ data = None
+ if site_id != 'vrtvideo':
+ # Old API endpoint, serves more formats but may fail for some videos
+ data = self._download_json(
+ 'https://mediazone.vrt.be/api/v1/%s/assets/%s'
+ % (site_id, video_id), video_id, 'Downloading asset JSON',
+ 'Unable to download asset JSON', fatal=False)
# New API endpoint
if not data:
+ headers = self.geo_verification_headers()
+ headers.update({'Content-Type': 'application/json'})
token = self._download_json(
'%s/tokens' % self._REST_API_BASE, video_id,
- 'Downloading token', data=b'',
- headers={'Content-Type': 'application/json'})['vrtPlayerToken']
+ 'Downloading token', data=b'', headers=headers)['vrtPlayerToken']
data = self._download_json(
'%s/videos/%s' % (self._REST_API_BASE, video_id),
- video_id, 'Downloading video JSON', fatal=False, query={
+ video_id, 'Downloading video JSON', query={
'vrtPlayerToken': token,
'client': '%s@PROD' % site_id,
}, expected_status=400)
- message = data.get('message')
- if message and not data.get('title'):
- if data.get('code') == 'AUTHENTICATION_REQUIRED':
- self.raise_login_required(message)
- raise ExtractorError(message, expected=True)
+ if not data.get('title'):
+ code = data.get('code')
+ if code == 'AUTHENTICATION_REQUIRED':
+ self.raise_login_required()
+ elif code == 'INVALID_LOCATION':
+ self.raise_geo_restricted(countries=['BE'])
+ raise ExtractorError(data.get('message') or code, expected=True)
title = data['title']
description = data.get('description')
class VrtNUIE(GigyaBaseIE):
IE_DESC = 'VrtNU.be'
- _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?P<site_id>vrtnu)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)'
_TESTS = [{
# Available via old API endpoint
- 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/',
+ 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/',
'info_dict': {
- 'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de',
+ 'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de',
'ext': 'mp4',
- 'title': 'De zwarte weduwe',
- 'description': 'md5:db1227b0f318c849ba5eab1fef895ee4',
+ 'title': 'Postbus X - Aflevering 1 (Seizoen 1989)',
+ 'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7',
'duration': 1457.04,
'thumbnail': r're:^https?://.*\.jpg$',
- 'season': 'Season 1',
- 'season_number': 1,
+ 'series': 'Postbus X',
+ 'season': 'Seizoen 1989',
+ 'season_number': 1989,
+ 'episode': 'De zwarte weduwe',
'episode_number': 1,
+ 'timestamp': 1595822400,
+ 'upload_date': '20200727',
},
'skip': 'This video is only available for registered users',
'params': {
def _real_extract(self, url):
display_id = self._match_id(url)
- webpage, urlh = self._download_webpage_handle(url, display_id)
-
- info = self._search_json_ld(webpage, display_id, default={})
-
- # title is optional here since it may be extracted by extractor
- # that is delegated from here
- title = strip_or_none(self._html_search_regex(
- r'(?ms)<h1 class="content__heading">(.+?)</h1>',
- webpage, 'title', default=None))
-
- description = self._html_search_regex(
- r'(?ms)<div class="content__description">(.+?)</div>',
- webpage, 'description', default=None)
-
- season = self._html_search_regex(
- [r'''(?xms)<div\ class="tabs__tab\ tabs__tab--active">\s*
- <span>seizoen\ (.+?)</span>\s*
- </div>''',
- r'<option value="seizoen (\d{1,3})" data-href="[^"]+?" selected>'],
- webpage, 'season', default=None)
-
- season_number = int_or_none(season)
-
- episode_number = int_or_none(self._html_search_regex(
- r'''(?xms)<div\ class="content__episode">\s*
- <abbr\ title="aflevering">afl</abbr>\s*<span>(\d+)</span>
- </div>''',
- webpage, 'episode_number', default=None))
-
- release_date = parse_iso8601(self._html_search_regex(
- r'(?ms)<div class="content__broadcastdate">\s*<time\ datetime="(.+?)"',
- webpage, 'release_date', default=None))
-
- # If there's a ? or a # in the URL, remove them and everything after
- clean_url = urlh.geturl().split('?')[0].split('#')[0].strip('/')
- securevideo_url = clean_url + '.mssecurevideo.json'
-
- try:
- video = self._download_json(securevideo_url, display_id)
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
- self.raise_login_required()
- raise
+ webpage = self._download_webpage(url, display_id)
- # We are dealing with a '../<show>.relevant' URL
- redirect_url = video.get('url')
- if redirect_url:
- return self.url_result(self._proto_relative_url(redirect_url, 'https:'))
+ attrs = extract_attributes(self._search_regex(
+ r'(<nui-media[^>]+>)', webpage, 'media element'))
+ video_id = attrs['videoid']
+ publication_id = attrs.get('publicationid')
+ if publication_id:
+ video_id = publication_id + '$' + video_id
- # There is only one entry, but with an unknown key, so just get
- # the first one
- video_id = list(video.values())[0].get('videoid')
+ page = (self._parse_json(self._search_regex(
+            r'digitalData\s*=\s*({.+?});', webpage, 'digital data',
+ default='{}'), video_id, fatal=False) or {}).get('page') or {}
+ info = self._search_json_ld(webpage, display_id, default={})
return merge_dicts(info, {
'_type': 'url_transparent',
'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
'ie_key': CanvasIE.ie_key(),
'id': video_id,
'display_id': display_id,
- 'title': title,
- 'description': description,
- 'season': season,
- 'season_number': season_number,
- 'episode_number': episode_number,
- 'release_date': release_date,
+ 'season_number': int_or_none(page.get('episode_season')),
})
class DPlayIE(InfoExtractor):
_VALID_URL = r'''(?x)https?://
(?P<domain>
- (?:www\.)?(?P<host>dplay\.(?P<country>dk|fi|jp|se|no))|
+ (?:www\.)?(?P<host>d
+ (?:
+ play\.(?P<country>dk|fi|jp|se|no)|
+ iscoveryplus\.(?P<plus_country>dk|es|fi|it|se|no)
+ )
+ )|
(?P<subdomain_country>es|it)\.dplay\.com
)/[^/]+/(?P<id>[^/]+/[^/?#]+)'''
}, {
'url': 'https://www.dplay.jp/video/gold-rush/24086',
'only_matching': True,
+ }, {
+ 'url': 'https://www.discoveryplus.se/videos/nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.discoveryplus.dk/videoer/ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.discoveryplus.no/videoer/i-kongens-klr/sesong-1-episode-7',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.discoveryplus.it/videos/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.discoveryplus.es/videos/la-fiebre-del-oro/temporada-8-episodio-1',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.discoveryplus.fi/videot/shifting-gears-with-aaron-kaufman/episode-16',
+ 'only_matching': True,
}]
def _get_disco_api_info(self, url, display_id, disco_host, realm, country):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('id')
domain = mobj.group('domain').lstrip('www.')
- country = mobj.group('country') or mobj.group('subdomain_country')
- host = 'disco-api.' + domain if domain.startswith('dplay.') else 'eu2-prod.disco-api.com'
+ country = mobj.group('country') or mobj.group('subdomain_country') or mobj.group('plus_country')
+ host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com'
return self._get_disco_api_info(
url, display_id, host, 'dplay' + country, country)
AppleTrailersIE,
AppleTrailersSectionIE,
)
+from .applepodcasts import ApplePodcastsIE
from .archiveorg import ArchiveOrgIE
from .arcpublishing import ArcPublishingIE
from .arkena import ArkenaIE
from .beatport import BeatportIE
from .bet import BetIE
from .bfi import BFIPlayerIE
+from .bfmtv import (
+ BFMTVIE,
+ BFMTVLiveIE,
+ BFMTVArticleIE,
+)
+from .bibeltv import BibelTVIE
from .bigflix import BigflixIE
from .bild import BildIE
from .bilibili import (
from .godtube import GodTubeIE
from .golem import GolemIE
from .googledrive import GoogleDriveIE
-from .googleplus import GooglePlusIE
+from .googlepodcasts import (
+ GooglePodcastsIE,
+ GooglePodcastsFeedIE,
+)
from .googlesearch import GoogleSearchIE
from .goshgay import GoshgayIE
from .gputechconf import GPUTechConfIE
OneUPIE,
PCMagIE,
)
+from .iheart import (
+ IHeartRadioIE,
+ IHeartRadioPodcastIE,
+)
from .imdb import (
ImdbIE,
ImdbListIE
+++ /dev/null
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-import codecs
-
-from .common import InfoExtractor
-from ..utils import unified_strdate
-
-
-class GooglePlusIE(InfoExtractor):
- IE_DESC = 'Google Plus'
- _VALID_URL = r'https?://plus\.google\.com/(?:[^/]+/)*?posts/(?P<id>\w+)'
- IE_NAME = 'plus.google'
- _TEST = {
- 'url': 'https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH',
- 'info_dict': {
- 'id': 'ZButuJc6CtH',
- 'ext': 'flv',
- 'title': '嘆きの天使 降臨',
- 'upload_date': '20120613',
- 'uploader': '井上ヨシマサ',
- }
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- # Step 1, Retrieve post webpage to extract further information
- webpage = self._download_webpage(url, video_id, 'Downloading entry webpage')
-
- title = self._og_search_description(webpage).splitlines()[0]
- upload_date = unified_strdate(self._html_search_regex(
- r'''(?x)<a.+?class="o-U-s\s[^"]+"\s+style="display:\s*none"\s*>
- ([0-9]{4}-[0-9]{2}-[0-9]{2})</a>''',
- webpage, 'upload date', fatal=False, flags=re.VERBOSE))
- uploader = self._html_search_regex(
- r'rel="author".*?>(.*?)</a>', webpage, 'uploader', fatal=False)
-
- # Step 2, Simulate clicking the image box to launch video
- DOMAIN = 'https://plus.google.com/'
- video_page = self._search_regex(
- r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN),
- webpage, 'video page URL')
- if not video_page.startswith(DOMAIN):
- video_page = DOMAIN + video_page
-
- webpage = self._download_webpage(video_page, video_id, 'Downloading video page')
-
- def unicode_escape(s):
- decoder = codecs.getdecoder('unicode_escape')
- return re.sub(
- r'\\u[0-9a-fA-F]{4,}',
- lambda m: decoder(m.group(0))[0],
- s)
-
- # Extract video links all sizes
- formats = [{
- 'url': unicode_escape(video_url),
- 'ext': 'flv',
- 'width': int(width),
- 'height': int(height),
- } for width, height, video_url in re.findall(
- r'\d+,(\d+),(\d+),"(https?://[^.]+\.googleusercontent\.com.*?)"', webpage)]
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'uploader': uploader,
- 'upload_date': upload_date,
- 'formats': formats,
- }
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_podcast_url,
+ int_or_none,
+ try_get,
+ urlencode_postdata,
+)
+
+
+class GooglePodcastsBaseIE(InfoExtractor):
+ _VALID_URL_BASE = r'https?://podcasts\.google\.com/feed/'
+
+ def _batch_execute(self, func_id, video_id, params):
+ return json.loads(self._download_json(
+ 'https://podcasts.google.com/_/PodcastsUi/data/batchexecute',
+ video_id, data=urlencode_postdata({
+ 'f.req': json.dumps([[[func_id, json.dumps(params), None, '1']]]),
+ }), transform_source=lambda x: self._search_regex(r'(?s)(\[.+\])', x, 'data'))[0][2])
+
+ def _extract_episode(self, episode):
+ return {
+ 'id': episode[4][3],
+ 'title': episode[8],
+ 'url': clean_podcast_url(episode[13]),
+ 'thumbnail': episode[2],
+ 'description': episode[9],
+ 'creator': try_get(episode, lambda x: x[14]),
+ 'timestamp': int_or_none(episode[11]),
+ 'duration': int_or_none(episode[12]),
+ 'series': episode[1],
+ }
+
+
+class GooglePodcastsIE(GooglePodcastsBaseIE):
+ IE_NAME = 'google:podcasts'
+ _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<feed_url>[^/]+)/episode/(?P<id>[^/?&#]+)'
+ _TEST = {
+ 'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA/episode/MzBlNWRlN2UtOWE4Yy00ODcwLTk2M2MtM2JlMmUyNmViOTRh',
+ 'md5': 'fa56b2ee8bd0703e27e42d4b104c4766',
+ 'info_dict': {
+ 'id': '30e5de7e-9a8c-4870-963c-3be2e26eb94a',
+ 'ext': 'mp3',
+ 'title': 'WWDTM New Year 2021',
+ 'description': 'We say goodbye to 2020 with Christine Baranksi, Doug Jones, Jonna Mendez, and Kellee Edwards.',
+ 'upload_date': '20210102',
+ 'timestamp': 1609606800,
+ 'duration': 2901,
+ 'series': "Wait Wait... Don't Tell Me!",
+ }
+ }
+
+ def _real_extract(self, url):
+ b64_feed_url, b64_guid = re.match(self._VALID_URL, url).groups()
+ episode = self._batch_execute(
+ 'oNjqVe', b64_guid, [b64_feed_url, b64_guid])[1]
+ return self._extract_episode(episode)
+
+
+class GooglePodcastsFeedIE(GooglePodcastsBaseIE):
+ IE_NAME = 'google:podcasts:feed'
+ _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<id>[^/?&#]+)/?(?:[?#&]|$)'
+ _TEST = {
+ 'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA',
+ 'info_dict': {
+ 'title': "Wait Wait... Don't Tell Me!",
+ 'description': "NPR's weekly current events quiz. Have a laugh and test your news knowledge while figuring out what's real and what we've made up.",
+ },
+ 'playlist_mincount': 20,
+ }
+
+ def _real_extract(self, url):
+ b64_feed_url = self._match_id(url)
+ data = self._batch_execute('ncqJEe', b64_feed_url, [b64_feed_url])
+
+ entries = []
+ for episode in (try_get(data, lambda x: x[1][0]) or []):
+ entries.append(self._extract_episode(episode))
+
+ feed = try_get(data, lambda x: x[3]) or []
+ return self.playlist_result(
+ entries, playlist_title=try_get(feed, lambda x: x[0]),
+ playlist_description=try_get(feed, lambda x: x[2]))
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ clean_podcast_url,
+ int_or_none,
+ str_or_none,
+)
+
+
+class IHeartRadioBaseIE(InfoExtractor):
+ def _call_api(self, path, video_id, fatal=True, query=None):
+ return self._download_json(
+ 'https://api.iheart.com/api/v3/podcast/' + path,
+ video_id, fatal=fatal, query=query)
+
+ def _extract_episode(self, episode):
+ return {
+ 'thumbnail': episode.get('imageUrl'),
+ 'description': clean_html(episode.get('description')),
+ 'timestamp': int_or_none(episode.get('startDate'), 1000),
+ 'duration': int_or_none(episode.get('duration')),
+ }
+
+
+class IHeartRadioIE(IHeartRadioBaseIE):
+    IE_NAME = 'iheartradio'
+ _VALID_URL = r'(?:https?://(?:www\.)?iheart\.com/podcast/[^/]+/episode/(?P<display_id>[^/?&#]+)-|iheartradio:)(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://www.iheart.com/podcast/105-behind-the-bastards-29236323/episode/part-one-alexander-lukashenko-the-dictator-70346499/?embed=true',
+ 'md5': 'c8609c92c8688dcb69d8541042b8abca',
+ 'info_dict': {
+ 'id': '70346499',
+ 'ext': 'mp3',
+ 'title': 'Part One: Alexander Lukashenko: The Dictator of Belarus',
+ 'description': 'md5:96cc7297b3a5a9ebae28643801c96fae',
+ 'timestamp': 1597741200,
+ 'upload_date': '20200818',
+ }
+ }
+
+ def _real_extract(self, url):
+ episode_id = self._match_id(url)
+ episode = self._call_api(
+ 'episodes/' + episode_id, episode_id)['episode']
+ info = self._extract_episode(episode)
+ info.update({
+ 'id': episode_id,
+ 'title': episode['title'],
+ 'url': clean_podcast_url(episode['mediaUrl']),
+ })
+ return info
+
+
+class IHeartRadioPodcastIE(IHeartRadioBaseIE):
+ IE_NAME = 'iheartradio:podcast'
+ _VALID_URL = r'https?://(?:www\.)?iheart(?:podcastnetwork)?\.com/podcast/[^/?&#]+-(?P<id>\d+)/?(?:[?#&]|$)'
+ _TESTS = [{
+ 'url': 'https://www.iheart.com/podcast/1119-it-could-happen-here-30717896/',
+ 'info_dict': {
+ 'id': '30717896',
+ 'title': 'It Could Happen Here',
+ 'description': 'md5:5842117412a967eb0b01f8088eb663e2',
+ },
+ 'playlist_mincount': 11,
+ }, {
+ 'url': 'https://www.iheartpodcastnetwork.com/podcast/105-stuff-you-should-know-26940277',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ podcast_id = self._match_id(url)
+ path = 'podcasts/' + podcast_id
+ episodes = self._call_api(
+ path + '/episodes', podcast_id, query={'limit': 1000000000})['data']
+
+ entries = []
+ for episode in episodes:
+ episode_id = str_or_none(episode.get('id'))
+ if not episode_id:
+ continue
+ info = self._extract_episode(episode)
+ info.update({
+ '_type': 'url',
+ 'id': episode_id,
+ 'title': episode.get('title'),
+ 'url': 'iheartradio:' + episode_id,
+ 'ie_key': IHeartRadioIE.ie_key(),
+ })
+ entries.append(info)
+
+ podcast = self._call_api(path, podcast_id, False) or {}
+
+ return self.playlist_result(
+ entries, podcast_id, podcast.get('title'), podcast.get('description'))
from .canvas import CanvasIE
from .common import InfoExtractor
+from ..compat import compat_urllib_parse_unquote
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+)
class KetnetIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P<id>(?:[^/]+/)*[^/?#&]+)'
_TESTS = [{
- 'url': 'https://www.ketnet.be/kijken/zomerse-filmpjes',
- 'md5': '6bdeb65998930251bbd1c510750edba9',
+ 'url': 'https://www.ketnet.be/kijken/n/nachtwacht/3/nachtwacht-s3a1-de-greystook',
+ 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9',
'info_dict': {
- 'id': 'zomerse-filmpjes',
+ 'id': 'pbs-pub-aef8b526-115e-4006-aa24-e59ff6c6ef6f$vid-ddb815bf-c8e7-467b-8879-6bad7a32cebd',
'ext': 'mp4',
- 'title': 'Gluur mee op de filmset en op Pennenzakkenrock',
- 'description': 'Gluur mee met Ghost Rockers op de filmset',
+ 'title': 'Nachtwacht - Reeks 3: Aflevering 1',
+ 'description': 'De Nachtwacht krijgt te maken met een parasiet',
'thumbnail': r're:^https?://.*\.jpg$',
- }
- }, {
- # mzid in playerConfig instead of sources
- 'url': 'https://www.ketnet.be/kijken/nachtwacht/de-greystook',
- 'md5': '90139b746a0a9bd7bb631283f6e2a64e',
- 'info_dict': {
- 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
- 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
- 'ext': 'flv',
- 'title': 'Nachtwacht: De Greystook',
- 'description': 'md5:1db3f5dc4c7109c821261e7512975be7',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 1468.03,
+ 'duration': 1468.02,
+ 'timestamp': 1609225200,
+ 'upload_date': '20201229',
+ 'series': 'Nachtwacht',
+ 'season': 'Reeks 3',
+ 'episode': 'De Greystook',
+ 'episode_number': 1,
},
'expected_warnings': ['is not a supported codec', 'Unknown MIME type'],
}, {
- 'url': 'https://www.ketnet.be/kijken/karrewiet/uitzending-8-september-2016',
- 'only_matching': True,
- }, {
- 'url': 'https://www.ketnet.be/achter-de-schermen/sien-repeteert-voor-stars-for-life',
- 'only_matching': True,
- }, {
- # mzsource, geo restricted to Belgium
- 'url': 'https://www.ketnet.be/kijken/nachtwacht/de-bermadoe',
+ 'url': 'https://www.ketnet.be/themas/karrewiet/jaaroverzicht-20200/karrewiet-het-jaar-van-black-mamba',
'only_matching': True,
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
-
- config = self._parse_json(
- self._search_regex(
- r'(?s)playerConfig\s*=\s*({.+?})\s*;', webpage,
- 'player config'),
- video_id)
-
- mzid = config.get('mzid')
- if mzid:
- return self.url_result(
- 'https://mediazone.vrt.be/api/v1/ketnet/assets/%s' % mzid,
- CanvasIE.ie_key(), video_id=mzid)
+ display_id = self._match_id(url)
- title = config['title']
+ video = self._download_json(
+ 'https://senior-bff.ketnet.be/graphql', display_id, query={
+ 'query': '''{
+ video(id: "content/ketnet/nl/%s.model.json") {
+ description
+ episodeNr
+ imageUrl
+ mediaReference
+ programTitle
+ publicationDate
+ seasonTitle
+ subtitleVideodetail
+ titleVideodetail
+ }
+}''' % display_id,
+ })['data']['video']
- formats = []
- for source_key in ('', 'mz'):
- source = config.get('%ssource' % source_key)
- if not isinstance(source, dict):
- continue
- for format_id, format_url in source.items():
- if format_id == 'hls':
- formats.extend(self._extract_m3u8_formats(
- format_url, video_id, 'mp4',
- entry_protocol='m3u8_native', m3u8_id=format_id,
- fatal=False))
- elif format_id == 'hds':
- formats.extend(self._extract_f4m_formats(
- format_url, video_id, f4m_id=format_id, fatal=False))
- else:
- formats.append({
- 'url': format_url,
- 'format_id': format_id,
- })
- self._sort_formats(formats)
+ mz_id = compat_urllib_parse_unquote(video['mediaReference'])
return {
- 'id': video_id,
- 'title': title,
- 'description': config.get('description'),
- 'thumbnail': config.get('image'),
- 'series': config.get('program'),
- 'episode': config.get('episode'),
- 'formats': formats,
+ '_type': 'url_transparent',
+ 'id': mz_id,
+ 'title': video['titleVideodetail'],
+ 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/' + mz_id,
+ 'thumbnail': video.get('imageUrl'),
+ 'description': video.get('description'),
+ 'timestamp': parse_iso8601(video.get('publicationDate')),
+ 'series': video.get('programTitle'),
+ 'season': video.get('seasonTitle'),
+ 'episode': video.get('subtitleVideodetail'),
+ 'episode_number': int_or_none(video.get('episodeNr')),
+ 'ie_key': CanvasIE.ie_key(),
}
# no keywords
'url': 'http://motherless.com/8B4BBC1',
'only_matching': True,
+ }, {
+ # see https://motherless.com/videos/recent for recent videos with
+        # upload date in "ago" format
+ 'url': 'https://motherless.com/3C3E2CF',
+ 'info_dict': {
+ 'id': '3C3E2CF',
+ 'ext': 'mp4',
+ 'title': 'a/ Hot Teens',
+ 'categories': list,
+ 'upload_date': '20210104',
+ 'uploader_id': 'yonbiw',
+ 'thumbnail': r're:https?://.*\.jpg',
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
age_limit = self._rta_search(webpage)
view_count = str_to_int(self._html_search_regex(
- (r'>(\d+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'),
+ (r'>([\d,.]+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'),
webpage, 'view count', fatal=False))
like_count = str_to_int(self._html_search_regex(
- (r'>(\d+)\s+Favorites<', r'<strong>Favorited</strong>\s+([^<]+)<'),
+ (r'>([\d,.]+)\s+Favorites<',
+ r'<strong>Favorited</strong>\s+([^<]+)<'),
webpage, 'like count', fatal=False))
- upload_date = self._html_search_regex(
- (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<',
- r'<strong>Uploaded</strong>\s+([^<]+)<'), webpage, 'upload date')
- if 'Ago' in upload_date:
- days = int(re.search(r'([0-9]+)', upload_date).group(1))
- upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d')
- else:
- upload_date = unified_strdate(upload_date)
+ upload_date = unified_strdate(self._search_regex(
+ r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage,
+ 'upload date', default=None))
+ if not upload_date:
+ uploaded_ago = self._search_regex(
+ r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago',
+ default=None)
+ if uploaded_ago:
+ delta = int(uploaded_ago[:-1])
+ _AGO_UNITS = {
+ 'h': 'hours',
+ 'd': 'days',
+ }
+ kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta}
+ upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d')
comment_count = webpage.count('class="media-comment-contents"')
uploader_id = self._html_search_regex(
legal_age = try_get(
data, lambda x: x['legalAge']['body']['rating']['code'], compat_str)
# https://en.wikipedia.org/wiki/Norwegian_Media_Authority
- if legal_age == 'A':
- age_limit = 0
- elif legal_age.isdigit():
- age_limit = int_or_none(legal_age)
- else:
- age_limit = None
+ age_limit = None
+ if legal_age:
+ if legal_age == 'A':
+ age_limit = 0
+ elif legal_age.isdigit():
+ age_limit = int_or_none(legal_age)
is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series'
'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce',
'duration': 2223.44,
'age_limit': 6,
+ 'subtitles': {
+ 'nb-nor': [{
+ 'ext': 'vtt',
+ }],
+ 'nb-ttv': [{
+ 'ext': 'vtt',
+ }]
+ },
},
}, {
'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
}.items() if v is not None)
@staticmethod
def _extract_subtitles(url, video_data):
    """Collect subtitle tracks from a Rai video JSON blob.

    Reads the 'subtitlesArray' list plus the legacy single-URL keys
    ('subtitles' and 'subtitlesUrl') and returns a youtube-dl subtitles
    dict mapping language code -> list of {'ext', 'url'} entries.
    When an STL subtitle is found, a sibling SRT variant with the same
    basename is offered as well.
    """
    STL_EXT = 'stl'
    SRT_EXT = 'srt'
    subtitles = {}
    # Copy before extending: appending the legacy keys directly onto the
    # list object stored in video_data would mutate the caller's JSON
    # dict (video_data['subtitlesArray'] would grow junk entries).
    subtitles_array = list(video_data.get('subtitlesArray') or [])
    for k in ('subtitles', 'subtitlesUrl'):
        subtitles_array.append({'url': video_data.get(k)})
    for subtitle in subtitles_array:
        sub_url = subtitle.get('url')
        if sub_url and isinstance(sub_url, compat_str):
            sub_lang = subtitle.get('language') or 'it'
            sub_url = urljoin(url, sub_url)
            sub_ext = determine_ext(sub_url, SRT_EXT)
            subtitles.setdefault(sub_lang, []).append({
                'ext': sub_ext,
                'url': sub_url,
            })
            if STL_EXT == sub_ext:
                # Rai serves an .srt rendition alongside each .stl one.
                subtitles[sub_lang].append({
                    'ext': SRT_EXT,
                    'url': sub_url[:-len(STL_EXT)] + SRT_EXT,
                })
    return subtitles
'duration': 6160,
'series': 'Report',
'season': '2013/14',
+ 'subtitles': {
+ 'it': 'count:2',
+ },
},
'params': {
'skip_download': True,
}, {
'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?',
'only_matching': True,
+ }, {
+ # subtitles at 'subtitlesArray' key (see #27698)
+ 'url': 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
if date_published and time_published:
date_published += ' ' + time_published
- subtitles = self._extract_subtitles(url, video.get('subtitles'))
+ subtitles = self._extract_subtitles(url, video)
program_info = media.get('program_info') or {}
season = media.get('season')
'params': {
'skip_download': True,
},
+ }, {
+ # ContentItem in iframe (see #12652) and subtitle at 'subtitlesUrl' key
+ 'url': 'http://www.presadiretta.rai.it/dl/portali/site/puntata/ContentItem-3ed19d13-26c2-46ff-a551-b10828262f1b.html',
+ 'info_dict': {
+ 'id': '1ad6dc64-444a-42a4-9bea-e5419ad2f5fd',
+ 'ext': 'mp4',
+ 'title': 'Partiti acchiappavoti - Presa diretta del 13/09/2015',
+ 'description': 'md5:d291b03407ec505f95f27970c0b025f4',
+ 'upload_date': '20150913',
+ 'subtitles': {
+ 'it': 'count:2',
+ },
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}, {
# Direct MMS URL
'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html',
'url': compat_urlparse.urljoin(url, thumbnail_url),
})
- subtitles = self._extract_subtitles(url, media.get('subtitlesUrl'))
+ subtitles = self._extract_subtitles(url, media)
info = {
'id': content_id,
r'''(?x)
(?:
(?:initEdizione|drawMediaRaiTV)\(|
- <(?:[^>]+\bdata-id|var\s+uniquename)=
+ <(?:[^>]+\bdata-id|var\s+uniquename)=|
+ <iframe[^>]+\bsrc=
)
(["\'])
(?:(?!\1).)*\bContentItem-(?P<id>%s)
class SBSIE(InfoExtractor):
IE_DESC = 'sbs.com.au'
- _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand|news)/video/(?:single/)?(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=)|news/(?:embeds/)?video/)(?P<id>[0-9]+)'
_TESTS = [{
# Original URL is handled by the generic IE which finds the iframe:
'url': 'http://www.sbs.com.au/ondemand/video/single/320403011771/?source=drupal&vertical=thefeed',
'md5': '3150cf278965eeabb5b4cea1c963fe0a',
'info_dict': {
- 'id': '320403011771',
+ 'id': '_rFBPRPO4pMR',
'ext': 'mp4',
'title': 'Dingo Conservation (The Feed)',
'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5',
}, {
'url': 'http://www.sbs.com.au/news/video/471395907773/The-Feed-July-9',
'only_matching': True,
+ }, {
+ 'url': 'https://www.sbs.com.au/ondemand/?play=1836638787723',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.sbs.com.au/ondemand/program/inside-windsor-castle?play=1283505731842',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.sbs.com.au/news/embeds/video/1840778819866',
+ 'only_matching': True,
}]
def _real_extract(self, url):
compat_str,
float_or_none,
int_or_none,
+ smuggle_url,
+ str_or_none,
+ try_get,
)
class STVPlayerIE(InfoExtractor):
IE_NAME = 'stv:player'
_VALID_URL = r'https?://player\.stv\.tv/(?P<type>episode|video)/(?P<id>[a-z0-9]{4})'
- _TEST = {
+ _TESTS = [{
+ # shortform
'url': 'https://player.stv.tv/video/4gwd/emmerdale/60-seconds-on-set-with-laura-norton/',
'md5': '5adf9439c31d554f8be0707c7abe7e0a',
'info_dict': {
'uploader_id': '1486976045',
},
'skip': 'this resource is unavailable outside of the UK',
- }
+ }, {
+ # episodes
+ 'url': 'https://player.stv.tv/episode/4125/jennifer-saunders-memory-lane',
+ 'only_matching': True,
+ }]
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1486976045/default_default/index.html?videoId=%s'
_PTYPE_MAP = {
'episode': 'episodes',
def _real_extract(self, url):
ptype, video_id = re.match(self._VALID_URL, url).groups()
- resp = self._download_json(
- 'https://player.api.stv.tv/v1/%s/%s' % (self._PTYPE_MAP[ptype], video_id),
- video_id)
- result = resp['results']
+ webpage = self._download_webpage(url, video_id, fatal=False) or ''
+ props = (self._parse_json(self._search_regex(
+ r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
+ webpage, 'next data', default='{}'), video_id,
+ fatal=False) or {}).get('props') or {}
+ player_api_cache = try_get(
+ props, lambda x: x['initialReduxState']['playerApiCache']) or {}
+
+ api_path, resp = None, {}
+ for k, v in player_api_cache.items():
+ if k.startswith('/episodes/') or k.startswith('/shortform/'):
+ api_path, resp = k, v
+ break
+ else:
+ episode_id = str_or_none(try_get(
+ props, lambda x: x['pageProps']['episodeId']))
+ api_path = '/%s/%s' % (self._PTYPE_MAP[ptype], episode_id or video_id)
+
+ result = resp.get('results')
+ if not result:
+ resp = self._download_json(
+ 'https://player.api.stv.tv/v1' + api_path, video_id)
+ result = resp['results']
+
video = result['video']
video_id = compat_str(video['id'])
return {
'_type': 'url_transparent',
'id': video_id,
- 'url': self.BRIGHTCOVE_URL_TEMPLATE % video_id,
+ 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['GB']}),
'description': result.get('summary'),
'duration': float_or_none(video.get('length'), 1000),
'subtitles': subtitles,
from .common import InfoExtractor
from ..compat import (
- compat_kwargs,
compat_parse_qs,
compat_str,
compat_urlparse,
_CLIENT_ID = 'kimne78kx3ncx6brgo4mv6wki5h1ko'
_NETRC_MACHINE = 'twitch'
- def _handle_error(self, response):
- if not isinstance(response, dict):
- return
- error = response.get('error')
- if error:
- raise ExtractorError(
- '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')),
- expected=True)
-
- def _call_api(self, path, item_id, *args, **kwargs):
- headers = kwargs.get('headers', {}).copy()
- headers.update({
- 'Accept': 'application/vnd.twitchtv.v5+json; charset=UTF-8',
- 'Client-ID': self._CLIENT_ID,
- })
- kwargs.update({
- 'headers': headers,
- 'expected_status': (400, 410),
- })
- response = self._download_json(
- '%s/%s' % (self._API_BASE, path), item_id,
- *args, **compat_kwargs(kwargs))
- self._handle_error(response)
- return response
+ _OPERATION_HASHES = {
+ 'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14',
+ 'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb',
+ 'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777',
+ 'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84',
+ 'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e',
+ 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01',
+ 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c',
+ 'VideoMetadata': '226edb3e692509f727fd56821f5653c05740242c82b0388883e0c0e75dcbf687',
+ }
def _real_initialize(self):
self._login()
})
self._sort_formats(formats)
def _download_base_gql(self, video_id, ops, note, fatal=True):
    """POST a raw GraphQL payload to Twitch's gql endpoint.

    ops is serialized with json.dumps as-is (a single operation dict or
    a list of operations).  Returns the decoded JSON response.
    """
    gql_headers = {
        'Content-Type': 'text/plain;charset=UTF-8',
        'Client-ID': self._CLIENT_ID,
    }
    return self._download_json(
        'https://gql.twitch.tv/gql', video_id, note,
        data=json.dumps(ops).encode(),
        headers=gql_headers, fatal=fatal)
def _download_gql(self, video_id, ops, note, fatal=True):
    """POST persisted-query GraphQL operations to Twitch.

    Each operation in ops is annotated in place with the persistedQuery
    extension (sha256 hash looked up in _OPERATION_HASHES by its
    'operationName') and then sent via _download_base_gql.
    """
    for op in ops:
        op['extensions'] = {
            'persistedQuery': {
                'version': 1,
                'sha256Hash': self._OPERATION_HASHES[op['operationName']],
            }
        }
    # Forward fatal: previously it was accepted but silently dropped,
    # making every persisted query fatal regardless of the caller.
    return self._download_base_gql(video_id, ops, note, fatal=fatal)
+
def _download_access_token(self, video_id, token_kind, param_name):
    # Fetch a playback access token via an ad-hoc (non-persisted)
    # GraphQL query.  token_kind selects the field queried
    # ('video' -> videoPlaybackAccessToken, 'stream' ->
    # streamPlaybackAccessToken) and param_name names the query
    # argument carrying video_id ('id' or 'channelName').
    # Returns the token node, a dict with 'value' and 'signature'.
    method = '%sPlaybackAccessToken' % token_kind
    ops = {
        'query': '''{
          %s(
            %s: "%s",
            params: {
              platform: "web",
              playerBackend: "mediaplayer",
              playerType: "site"
            }
          )
          {
            value
            signature
          }
        }''' % (method, param_name, video_id),
    }
    return self._download_base_gql(
        video_id, ops,
        'Downloading %s access token GraphQL' % token_kind)['data'][method]
class TwitchVodIE(TwitchBaseIE):
)
(?P<id>\d+)
'''
- _ITEM_TYPE = 'vod'
- _ITEM_SHORTCUT = 'v'
_TESTS = [{
'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s',
'title': 'LCK Summer Split - Week 6 Day 1',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 17208,
- 'timestamp': 1435131709,
+ 'timestamp': 1435131734,
'upload_date': '20150624',
'uploader': 'Riot Games',
'uploader_id': 'riotgames',
}]
def _download_info(self, item_id):
    """Fetch VOD metadata via the VideoMetadata GraphQL operation and
    map it into an info dict.

    Raises ExtractorError (expected) when the VOD does not exist.
    """
    data = self._download_gql(
        item_id, [{
            'operationName': 'VideoMetadata',
            'variables': {
                'channelLogin': '',
                'videoID': item_id,
            },
        }],
        # was "stream metadata" — a copy-paste from the live-stream
        # query; this request downloads VOD (video) metadata
        'Downloading video metadata GraphQL')[0]['data']
    video = data.get('video')
    if video is None:
        raise ExtractorError(
            'Video %s does not exist' % item_id, expected=True)
    return self._extract_info_gql(video, item_id)
@staticmethod
def _extract_info(info):
'is_live': is_live,
}
@staticmethod
def _extract_info_gql(info, item_id):
    """Map a GraphQL video node to a youtube-dl info dict."""
    vod_id = info.get('id') or item_id
    # Keep the historical 'v'-prefixed id so existing download archives
    # still recognize previously downloaded VODs.
    if vod_id[0] != 'v':
        vod_id = 'v%s' % vod_id
    thumbnail = url_or_none(info.get('previewThumbnailURL'))
    if thumbnail:
        # The thumbnail URL is a size template; 0 requests the original
        # dimensions.
        thumbnail = thumbnail.replace('{width}', '0').replace('{height}', '0')

    def owner_field(field):
        return try_get(info, lambda x: x['owner'][field], compat_str)

    return {
        'id': vod_id,
        'title': info.get('title') or 'Untitled Broadcast',
        'description': info.get('description'),
        'duration': int_or_none(info.get('lengthSeconds')),
        'thumbnail': thumbnail,
        'uploader': owner_field('displayName'),
        'uploader_id': owner_field('login'),
        'timestamp': unified_timestamp(info.get('publishedAt')),
        'view_count': int_or_none(info.get('viewCount')),
    }
+
def _real_extract(self, url):
vod_id = self._match_id(url)
info = self._download_info(vod_id)
- access_token = self._call_api(
- 'api/vods/%s/access_token' % vod_id, vod_id,
- 'Downloading %s access token' % self._ITEM_TYPE)
+ access_token = self._download_access_token(vod_id, 'video', 'id')
formats = self._extract_m3u8_formats(
'%s/vod/%s.m3u8?%s' % (
'allow_spectre': 'true',
'player': 'twitchweb',
'playlist_include_framerate': 'true',
- 'nauth': access_token['token'],
- 'nauthsig': access_token['sig'],
+ 'nauth': access_token['value'],
+ 'nauthsig': access_token['signature'],
})),
vod_id, 'mp4', entry_protocol='m3u8_native')
}
-class TwitchGraphQLBaseIE(TwitchBaseIE):
- _PAGE_LIMIT = 100
-
- _OPERATION_HASHES = {
- 'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14',
- 'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb',
- 'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777',
- 'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84',
- 'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e',
- 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01',
- 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c',
- }
-
- def _download_gql(self, video_id, ops, note, fatal=True):
- for op in ops:
- op['extensions'] = {
- 'persistedQuery': {
- 'version': 1,
- 'sha256Hash': self._OPERATION_HASHES[op['operationName']],
- }
- }
- return self._download_json(
- 'https://gql.twitch.tv/gql', video_id, note,
- data=json.dumps(ops).encode(),
- headers={
- 'Content-Type': 'text/plain;charset=UTF-8',
- 'Client-ID': self._CLIENT_ID,
- }, fatal=fatal)
-
-
-class TwitchCollectionIE(TwitchGraphQLBaseIE):
+class TwitchCollectionIE(TwitchBaseIE):
_VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/collections/(?P<id>[^/]+)'
_TESTS = [{
entries, playlist_id=collection_id, playlist_title=title)
-class TwitchPlaylistBaseIE(TwitchGraphQLBaseIE):
+class TwitchPlaylistBaseIE(TwitchBaseIE):
+ _PAGE_LIMIT = 100
+
def _entries(self, channel_name, *args):
cursor = None
variables_common = self._make_variables(channel_name, *args)
if not cursor or not isinstance(cursor, compat_str):
break
- # Deprecated kraken v5 API
- def _entries_kraken(self, channel_name, broadcast_type, sort):
- access_token = self._download_access_token(channel_name)
- channel_id = self._extract_channel_id(access_token['token'], channel_name)
- offset = 0
- counter_override = None
- for counter in itertools.count(1):
- response = self._call_api(
- 'kraken/channels/%s/videos/' % channel_id,
- channel_id,
- 'Downloading video JSON page %s' % (counter_override or counter),
- query={
- 'offset': offset,
- 'limit': self._PAGE_LIMIT,
- 'broadcast_type': broadcast_type,
- 'sort': sort,
- })
- videos = response.get('videos')
- if not isinstance(videos, list):
- break
- for video in videos:
- if not isinstance(video, dict):
- continue
- video_url = url_or_none(video.get('url'))
- if not video_url:
- continue
- yield {
- '_type': 'url_transparent',
- 'ie_key': TwitchVodIE.ie_key(),
- 'id': video.get('_id'),
- 'url': video_url,
- 'title': video.get('title'),
- 'description': video.get('description'),
- 'timestamp': unified_timestamp(video.get('published_at')),
- 'duration': float_or_none(video.get('length')),
- 'view_count': int_or_none(video.get('views')),
- 'language': video.get('language'),
- }
- offset += self._PAGE_LIMIT
- total = int_or_none(response.get('_total'))
- if total and offset >= total:
- break
-
class TwitchVideosIE(TwitchPlaylistBaseIE):
_VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/(?P<id>[^/]+)/(?:videos|profile)'
playlist_title='%s - Collections' % channel_name)
-class TwitchStreamIE(TwitchGraphQLBaseIE):
+class TwitchStreamIE(TwitchBaseIE):
IE_NAME = 'twitch:stream'
_VALID_URL = r'''(?x)
https?://
if not stream:
raise ExtractorError('%s is offline' % channel_name, expected=True)
- access_token = self._download_access_token(channel_name)
- token = access_token['token']
+ access_token = self._download_access_token(
+ channel_name, 'stream', 'channelName')
+ token = access_token['value']
stream_id = stream.get('id') or channel_name
query = {
'player': 'twitchweb',
'playlist_include_framerate': 'true',
'segment_preference': '4',
- 'sig': access_token['sig'].encode('utf-8'),
+ 'sig': access_token['signature'].encode('utf-8'),
'token': token.encode('utf-8'),
}
formats = self._extract_m3u8_formats(
def _real_extract(self, url):
video_id = self._match_id(url)
- clip = self._download_json(
- 'https://gql.twitch.tv/gql', video_id, data=json.dumps({
+ clip = self._download_base_gql(
+ video_id, {
'query': '''{
clip(slug: "%s") {
broadcaster {
}
viewCount
}
-}''' % video_id,
- }).encode(), headers={
- 'Client-ID': self._CLIENT_ID,
- })['data']['clip']
+}''' % video_id}, 'Downloading clip GraphQL')['data']['clip']
if not clip:
raise ExtractorError(
'info_dict': {
'id': '700207533655363584',
'ext': 'mp4',
- 'title': 'simon vetugo - BEAT PROD: @suhmeduh #Damndaniel',
+ 'title': 'simon vertugo - BEAT PROD: @suhmeduh #Damndaniel',
'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ',
'thumbnail': r're:^https?://.*\.jpg',
- 'uploader': 'simon vetugo',
+ 'uploader': 'simon vertugo',
'uploader_id': 'simonvertugo',
'duration': 30.0,
'timestamp': 1455777459,
'timestamp': 1492000653,
'upload_date': '20170412',
},
+ 'skip': 'Account suspended',
}, {
'url': 'https://twitter.com/i/web/status/910031516746514432',
'info_dict': {
# promo_video_website card
'url': 'https://twitter.com/GunB1g/status/1163218564784017422',
'only_matching': True,
+ }, {
+ # promo_video_convo card
+ 'url': 'https://twitter.com/poco_dandy/status/1047395834013384704',
+ 'only_matching': True,
+ }, {
+ # appplayer card
+ 'url': 'https://twitter.com/poco_dandy/status/1150646424461176832',
+ 'only_matching': True,
}]
def _real_extract(self, url):
return try_get(o, lambda x: x[x['type'].lower() + '_value'])
card_name = card['name'].split(':')[-1]
- if card_name in ('amplify', 'promo_video_website'):
+ if card_name == 'player':
+ info.update({
+ '_type': 'url',
+ 'url': get_binding_value('player_url'),
+ })
+ elif card_name == 'periscope_broadcast':
+ info.update({
+ '_type': 'url',
+ 'url': get_binding_value('url') or get_binding_value('player_url'),
+ 'ie_key': PeriscopeIE.ie_key(),
+ })
+ elif card_name == 'broadcast':
+ info.update({
+ '_type': 'url',
+ 'url': get_binding_value('broadcast_url'),
+ 'ie_key': TwitterBroadcastIE.ie_key(),
+ })
+ elif card_name == 'summary':
+ info.update({
+ '_type': 'url',
+ 'url': get_binding_value('card_url'),
+ })
+ # amplify, promo_video_website, promo_video_convo, appplayer, ...
+ else:
is_amplify = card_name == 'amplify'
vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url')
content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player'))
'duration': int_or_none(get_binding_value(
'content_duration_seconds')),
})
- elif card_name == 'player':
- info.update({
- '_type': 'url',
- 'url': get_binding_value('player_url'),
- })
- elif card_name == 'periscope_broadcast':
- info.update({
- '_type': 'url',
- 'url': get_binding_value('url') or get_binding_value('player_url'),
- 'ie_key': PeriscopeIE.ie_key(),
- })
- elif card_name == 'broadcast':
- info.update({
- '_type': 'url',
- 'url': get_binding_value('broadcast_url'),
- 'ie_key': TwitterBroadcastIE.ie_key(),
- })
- else:
- raise ExtractorError('Unsupported Twitter Card.')
else:
expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url'])
if not expanded_url:
class XFileShareIE(InfoExtractor):
_SITES = (
+ (r'aparat\.cam', 'Aparat'),
(r'clipwatching\.com', 'ClipWatching'),
(r'gounlimited\.to', 'GoUnlimited'),
(r'govid\.me', 'GoVid'),
'title': 'sample',
'thumbnail': r're:http://.*\.jpg',
},
+ }, {
+ 'url': 'https://aparat.cam/n4d6dh0wvlpr',
+ 'only_matching': True,
}]
@staticmethod
if func and val not in ignore:
val = func(val)
return template % val if val not in ignore else default
+
+
def clean_podcast_url(url):
    """Strip known download-tracking redirect prefixes from a podcast
    media URL, returning the direct URL to the audio file.

    Covered trackers: Chartable (chtbl.com), Blubrry, Podtrac
    (play/dts/www hosts), Acast flex, Podcorn (pdcn.co) and
    Podsights (pdst.fm).
    """
    tracking_prefix = re.compile(r'''(?x)
    (?:
        (?:
            chtbl\.com/track|
            media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
            play\.podtrac\.com
        )/[^/]+|
        (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
        flex\.acast\.com|
        pd(?:
            cn\.co| # https://podcorn.com/analytics-prefix/
            st\.fm # https://podsights.com/docs/
        )/e
    )/''')
    return tracking_prefix.sub('', url)