ExtractorError,
float_or_none,
HEADRequest,
+ int_or_none,
is_html,
js_to_json,
KNOWN_EXTENSIONS,
merge_dicts,
mimetype2ext,
orderedSet,
+ parse_duration,
sanitized_Request,
smuggle_url,
unescapeHTML,
- unified_strdate,
+ unified_timestamp,
unsmuggle_url,
UnsupportedError,
+ url_or_none,
+ xpath_attr,
xpath_text,
+ xpath_with_ns,
)
from .commonprotocols import RtmpIE
from .brightcove import (
from .rutv import RUTVIE
from .tvc import TVCIE
from .sportbox import SportBoxIE
-from .smotri import SmotriIE
from .myvi import MyviIE
from .condenast import CondeNastIE
from .udn import UDNEmbedIE
from .mofosex import MofosexEmbedIE
from .spankwire import SpankwireIE
from .youporn import YouPornIE
-from .vimeo import VimeoIE
+from .vimeo import (
+ VimeoIE,
+ VHXEmbedIE,
+)
from .dailymotion import DailymotionIE
from .dailymail import DailyMailIE
from .onionstudios import OnionStudiosIE
from .zype import ZypeIE
from .odnoklassniki import OdnoklassnikiIE
from .kinja import KinjaEmbedIE
+from .gedi import GediEmbedsIE
+from .rcs import RCSEmbedsIE
from .bitchute import BitChuteIE
+from .rumble import RumbleEmbedIE
+from .arcpublishing import ArcPublishingIE
+from .medialaan import MedialaanIE
+from .simplecast import SimplecastIE
class GenericIE(InfoExtractor):
{
'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
'info_dict': {
- 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
- 'ext': 'm4v',
- 'upload_date': '20150228',
- 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
- }
+ 'id': 'http://podcastfeeds.nbcnews.com/nbcnews/video/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
+ 'title': 'MSNBC Rachel Maddow (video)',
+ 'description': 're:.*her unique approach to storytelling.*',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'ext': 'mov',
+ 'id': 'pdv_maddow_netcast_mov-12-03-2020-223726',
+ 'title': 'MSNBC Rachel Maddow (video) - 12-03-2020-223726',
+ 'description': 're:.*her unique approach to storytelling.*',
+ 'upload_date': '20201204',
+ },
+ }],
+ },
+ # RSS feed with item with description and thumbnails
+ {
+ 'url': 'https://anchor.fm/s/dd00e14/podcast/rss',
+ 'info_dict': {
+ 'id': 'https://anchor.fm/s/dd00e14/podcast/rss',
+ 'title': 're:.*100% Hydrogen.*',
+ 'description': 're:.*In this episode.*',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'ext': 'm4a',
+ 'id': 'c1c879525ce2cb640b344507e682c36d',
+ 'title': 're:Hydrogen!',
+ 'description': 're:.*In this episode we are going.*',
+ 'timestamp': 1567977776,
+ 'upload_date': '20190908',
+ 'duration': 459,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'episode_number': 1,
+ 'season_number': 1,
+ 'age_limit': 0,
+ },
+ }],
+ 'params': {
+ 'skip_download': True,
+ },
},
# RSS feed with enclosures and unsupported link URLs
{
},
'add_ie': [SpringboardPlatformIE.ie_key()],
},
- {
- 'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU',
- 'info_dict': {
- 'id': 'uPDB5I9wfp8',
- 'ext': 'webm',
- 'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3',
- 'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d',
- 'upload_date': '20160219',
- 'uploader': 'Pocoyo - Português (BR)',
- 'uploader_id': 'PocoyoBrazil',
- },
- 'add_ie': [YoutubeIE.ie_key()],
- 'params': {
- 'skip_download': True,
- },
- },
{
'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html',
'info_dict': {
'skip_download': True,
},
},
- {
- # Zype embed
- 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites',
- 'info_dict': {
- 'id': '5b400b834b32992a310622b9',
- 'ext': 'mp4',
- 'title': 'Smoky Barbecue Favorites',
- 'thumbnail': r're:^https?://.*\.jpe?g',
- 'description': 'md5:5ff01e76316bd8d46508af26dc86023b',
- 'upload_date': '20170909',
- 'timestamp': 1504915200,
- },
- 'add_ie': [ZypeIE.ie_key()],
- 'params': {
- 'skip_download': True,
- },
- },
+ # {
+ # # Zype embed
+ # 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites',
+ # 'info_dict': {
+ # 'id': '5b400b834b32992a310622b9',
+ # 'ext': 'mp4',
+ # 'title': 'Smoky Barbecue Favorites',
+ # 'thumbnail': r're:^https?://.*\.jpe?g',
+ # 'description': 'md5:5ff01e76316bd8d46508af26dc86023b',
+ # 'upload_date': '20170909',
+ # 'timestamp': 1504915200,
+ # },
+ # 'add_ie': [ZypeIE.ie_key()],
+ # 'params': {
+ # 'skip_download': True,
+ # },
+ # },
{
# videojs embed
'url': 'https://video.sibnet.ru/shell.php?videoid=3422904',
# 'params': {
# 'force_generic_extractor': True,
# },
- # }
+ # },
+ {
+ # VHX Embed
+ 'url': 'https://demo.vhx.tv/category-c/videos/file-example-mp4-480-1-5mg-copy',
+ 'info_dict': {
+ 'id': '858208',
+ 'ext': 'mp4',
+ 'title': 'Untitled',
+ 'uploader_id': 'user80538407',
+ 'uploader': 'OTT Videos',
+ },
+ },
+ {
+ # ArcPublishing PoWa video player
+ 'url': 'https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/',
+ 'md5': 'b03b2fac8680e1e5a7cc81a5c27e71b3',
+ 'info_dict': {
+ 'id': '8c99cb6e-b29c-4bc9-9173-7bf9979225ab',
+ 'ext': 'mp4',
+ 'title': 'Senate candidates wave to voters on Anchorage streets',
+ 'description': 'md5:91f51a6511f090617353dc720318b20e',
+ 'timestamp': 1604378735,
+ 'upload_date': '20201103',
+ 'duration': 1581,
+ },
+ },
+ {
+ # MyChannels SDK embed
+ # https://www.24kitchen.nl/populair/deskundige-dit-waarom-sommigen-gevoelig-zijn-voor-voedselallergieen
+ 'url': 'https://www.demorgen.be/nieuws/burgemeester-rotterdam-richt-zich-in-videoboodschap-tot-relschoppers-voelt-het-goed~b0bcfd741/',
+ 'md5': '90c0699c37006ef18e198c032d81739c',
+ 'info_dict': {
+ 'id': '194165',
+ 'ext': 'mp4',
+ 'title': 'Burgemeester Aboutaleb spreekt relschoppers toe',
+ 'timestamp': 1611740340,
+ 'upload_date': '20210127',
+ 'duration': 159,
+ },
+ },
+ {
+ # Simplecast player embed
+ 'url': 'https://www.bio.org/podcast',
+ 'info_dict': {
+ 'id': 'podcast',
+ 'title': 'I AM BIO Podcast | BIO',
+ },
+ 'playlist_mincount': 52,
+ },
]
def report_following_redirect(self, new_url):
playlist_desc_el = doc.find('./channel/description')
playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
+ NS_MAP = {
+ 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd',
+ }
+
entries = []
for it in doc.findall('./channel/item'):
next_url = None
if not next_url:
continue
+ def itunes(key):
+ return xpath_text(
+ it, xpath_with_ns('./itunes:%s' % key, NS_MAP),
+ default=None)
+
+ duration = itunes('duration')
+ explicit = (itunes('explicit') or '').lower()
+ if explicit in ('true', 'yes'):
+ age_limit = 18
+ elif explicit in ('false', 'no'):
+ age_limit = 0
+ else:
+ age_limit = None
+
entries.append({
'_type': 'url_transparent',
'url': next_url,
'title': it.find('title').text,
+ 'description': xpath_text(it, 'description', default=None),
+ 'timestamp': unified_timestamp(
+ xpath_text(it, 'pubDate', default=None)),
+ 'duration': int_or_none(duration) or parse_duration(duration),
+ 'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')),
+ 'episode': itunes('title'),
+ 'episode_number': int_or_none(itunes('episode')),
+ 'season_number': int_or_none(itunes('season')),
+ 'age_limit': age_limit,
})
return {
info_dict = {
'id': video_id,
'title': self._generic_title(url),
- 'upload_date': unified_strdate(head_response.headers.get('Last-Modified'))
+ 'timestamp': unified_timestamp(head_response.headers.get('Last-Modified'))
}
# Check for direct link to a video
webpage = self._webpage_read_content(
full_response, url, video_id, prefix=first_bytes)
+ if '<title>DPG Media Privacy Gate</title>' in webpage:
+ webpage = self._download_webpage(url, video_id)
+
self.report_extraction(video_id)
# Is it an RSS feed, a SMIL file, an XSPF playlist or an MPD manifest?
# Sometimes an embedded video player is hidden behind percent encoding
# (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
# Unescaping the whole page allows handling those cases in a generic way
- webpage = compat_urllib_parse_unquote(webpage)
+ # FIXME: unescaping the whole page may break URLs, so it is commented out for now.
+ # There should probably be a second run of the generic extractor on the unescaped webpage.
+ # webpage = compat_urllib_parse_unquote(webpage)
# Unescape squarespace embeds to be detected by generic extractor,
# see https://github.com/ytdl-org/youtube-dl/issues/21294
if tp_urls:
return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform')
+ arc_urls = ArcPublishingIE._extract_urls(webpage)
+ if arc_urls:
+ return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key())
+
+ mychannels_urls = MedialaanIE._extract_urls(webpage)
+ if mychannels_urls:
+ return self.playlist_from_matches(
+ mychannels_urls, video_id, video_title, ie=MedialaanIE.ie_key())
+
# Look for embedded rtl.nl player
matches = re.findall(
r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"',
if vimeo_urls:
return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key())
+ vhx_url = VHXEmbedIE._extract_url(webpage)
+ if vhx_url:
+ return self.url_result(vhx_url, VHXEmbedIE.ie_key())
+
vid_me_embed_url = self._search_regex(
r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]',
webpage, 'vid.me embed', default=None)
return self.playlist_from_matches(
matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')
+ # Look for Simplecast embeds
+ simplecast_urls = SimplecastIE._extract_urls(webpage)
+ if simplecast_urls:
+ return self.playlist_from_matches(
+ simplecast_urls, video_id, video_title)
+
# Look for BBC iPlayer embed
matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
if matches:
if mobj is not None:
return self.url_result(mobj.group('url'))
- # Look for embedded smotri.com player
- smotri_url = SmotriIE._extract_url(webpage)
- if smotri_url:
- return self.url_result(smotri_url, 'Smotri')
-
# Look for embedded Myvi.ru player
myvi_url = MyviIE._extract_url(webpage)
if myvi_url:
return self.playlist_from_matches(
zype_urls, video_id, video_title, ie=ZypeIE.ie_key())
+ # Look for GEDI and RCS media group embeds
+ gedi_urls = GediEmbedsIE._extract_urls(webpage)
+ if gedi_urls:
+ return self.playlist_from_matches(
+ gedi_urls, video_id, video_title, ie=GediEmbedsIE.ie_key())
+
+ rcs_urls = RCSEmbedsIE._extract_urls(webpage)
+ if rcs_urls:
+ return self.playlist_from_matches(
+ rcs_urls, video_id, video_title, ie=RCSEmbedsIE.ie_key())
+
bitchute_urls = BitChuteIE._extract_urls(webpage)
if bitchute_urls:
return self.playlist_from_matches(
bitchute_urls, video_id, video_title, ie=BitChuteIE.ie_key())
+ rumble_urls = RumbleEmbedIE._extract_urls(webpage)
+ if len(rumble_urls) == 1:
+ return self.url_result(rumble_urls[0], RumbleEmbedIE.ie_key())
+ if rumble_urls:
+ return self.playlist_from_matches(
+ rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key())
+
# Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries: