from youtube_dlc.extractor import (
YoutubePlaylistIE,
+ YoutubeTabIE,
YoutubeIE,
)
entries = result['entries']
self.assertEqual(len(entries), 100)
- def test_youtube_flat_playlist_titles(self):
+ def test_youtube_flat_playlist_extraction(self):
+ # With extract_flat enabled the playlist entries are url_transparent
+ # stubs pointing at the Youtube IE, not fully resolved videos; this
+ # test pins the exact stub fields for a known single-video playlist.
dl = FakeYDL()
dl.params['extract_flat'] = True
- ie = YoutubePlaylistIE(dl)
- result = ie.extract('https://www.youtube.com/playlist?list=PL-KKIb8rvtMSrAO9YFbeM6UQrAqoFTUWv')
+ ie = YoutubeTabIE(dl)
+ result = ie.extract('https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc')
self.assertIsPlaylist(result)
- for entry in result['entries']:
- self.assertTrue(entry.get('title'))
+ entries = list(result['entries'])
+ self.assertTrue(len(entries) == 1)
+ video = entries[0]
+ self.assertEqual(video['_type'], 'url_transparent')
+ self.assertEqual(video['ie_key'], 'Youtube')
+ self.assertEqual(video['id'], 'BaW_jenozKc')
+ self.assertEqual(video['url'], 'BaW_jenozKc')
+ self.assertEqual(video['title'], 'youtube-dl test video "\'/\\ä↭𝕐')
+ self.assertEqual(video['duration'], 10)
+ self.assertEqual(video['uploader'], 'Philipp Hagemeister')
if __name__ == '__main__':
formats = []
for a in video_node.findall('.//asset'):
+ # fileName is either an absolute URL or, for RTMP, a play path that is
+ # combined with serverPrefix below; url_or_none() distinguishes the two
+ file_name = xpath_text(a, './fileName', default=None)
+ if not file_name:
+ continue
+ format_type = a.attrib.get('type')
+ format_url = url_or_none(file_name)
+ if format_url:
+ ext = determine_ext(file_name)
+ if ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, display_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id=format_type or 'hls', fatal=False))
+ continue
+ elif ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ update_url_query(format_url, {'hdcore': '3.7.0'}),
+ display_id, f4m_id=format_type or 'hds', fatal=False))
+ continue
f = {
- 'format_id': a.attrib['type'],
- 'width': int_or_none(a.find('./frameWidth').text),
- 'height': int_or_none(a.find('./frameHeight').text),
- 'vbr': int_or_none(a.find('./bitrateVideo').text),
- 'abr': int_or_none(a.find('./bitrateAudio').text),
- 'vcodec': a.find('./codecVideo').text,
- 'tbr': int_or_none(a.find('./totalBitrate').text),
+ # xpath_text() tolerates missing child nodes, unlike .find().text
+ # which raises AttributeError when the node is absent
+ 'format_id': format_type,
+ 'width': int_or_none(xpath_text(a, './frameWidth')),
+ 'height': int_or_none(xpath_text(a, './frameHeight')),
+ 'vbr': int_or_none(xpath_text(a, './bitrateVideo')),
+ 'abr': int_or_none(xpath_text(a, './bitrateAudio')),
+ 'vcodec': xpath_text(a, './codecVideo'),
+ 'tbr': int_or_none(xpath_text(a, './totalBitrate')),
}
- if a.find('./serverPrefix').text:
- f['url'] = a.find('./serverPrefix').text
- f['playpath'] = a.find('./fileName').text
+ server_prefix = xpath_text(a, './serverPrefix', default=None)
+ if server_prefix:
+ f.update({
+ 'url': server_prefix,
+ 'playpath': file_name,
+ })
else:
- f['url'] = a.find('./fileName').text
+ # without a server prefix the fileName must itself be a valid URL
+ if not format_url:
+ continue
+ f['url'] = format_url
formats.append(f)
self._sort_formats(formats)
from .gigya import GigyaBaseIE
from ..compat import compat_HTTPError
from ..utils import (
- extract_attributes,
ExtractorError,
- strip_or_none,
+ clean_html,
+ extract_attributes,
float_or_none,
+ get_element_by_class,
int_or_none,
merge_dicts,
str_or_none,
+ strip_or_none,
url_or_none,
)
class CanvasIE(InfoExtractor):
- _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)'
+ # 'dako' site id added so DagelijkseKostIE can delegate its mediazone
+ # asset URLs to this extractor
+ _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza|dako)/assets/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
'md5': '68993eda72ef62386a15ea2cf3c93107',
'display_id': display_id,
'season_number': int_or_none(page.get('episode_season')),
})
+
+
+class DagelijkseKostIE(InfoExtractor):
+ # Scrapes title/description from the recipe page, then delegates the
+ # actual media extraction to CanvasIE via a url_transparent result
+ # pointing at the VRT mediazone 'dako' asset API.
+ IE_DESC = 'dagelijksekost.een.be'
+ _VALID_URL = r'https?://dagelijksekost\.een\.be/gerechten/(?P<id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'https://dagelijksekost.een.be/gerechten/hachis-parmentier-met-witloof',
+ 'md5': '30bfffc323009a3e5f689bef6efa2365',
+ 'info_dict': {
+ 'id': 'md-ast-27a4d1ff-7d7b-425e-b84f-a4d227f592fa',
+ 'display_id': 'hachis-parmentier-met-witloof',
+ 'ext': 'mp4',
+ 'title': 'Hachis parmentier met witloof',
+ 'description': 'md5:9960478392d87f63567b5b117688cdc5',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 283.02,
+ },
+ 'expected_warnings': ['is not a supported codec'],
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ # Prefer on-page markup; fall back to twitter/og meta tags
+ title = strip_or_none(get_element_by_class(
+ 'dish-metadata__title', webpage
+ ) or self._html_search_meta(
+ 'twitter:title', webpage))
+
+ description = clean_html(get_element_by_class(
+ 'dish-description', webpage)
+ ) or self._html_search_meta(
+ ('description', 'twitter:description', 'og:description'),
+ webpage)
+
+ # data-url attribute carries the mediazone asset id (md-ast-...)
+ video_id = self._html_search_regex(
+ r'data-url=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
+ group='id')
+
+ return {
+ '_type': 'url_transparent',
+ 'url': 'https://mediazone.vrt.be/api/v1/dako/assets/%s' % video_id,
+ 'ie_key': CanvasIE.ie_key(),
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ }
# coding: utf-8
from __future__ import unicode_literals
+import calendar
import datetime
import re
from .common import InfoExtractor
from ..utils import (
clean_html,
+ extract_timezone,
int_or_none,
parse_duration,
parse_resolution,
timestamp = None
data_utc = try_get(informacio, lambda x: x['data_emissio']['utc'])
try:
- timestamp = datetime.datetime.strptime(
- data_utc, '%Y-%d-%mT%H:%M:%S%z').timestamp()
+ # NOTE(review): presumably avoids strptime's %z directive (not
+ # available on Python 2); extract_timezone splits off the offset,
+ # which is then subtracted to get a UTC epoch via calendar.timegm
+ timezone, data_utc = extract_timezone(data_utc)
+ timestamp = calendar.timegm((datetime.datetime.strptime(
+ data_utc, '%Y-%d-%mT%H:%M:%S') - timezone).timetuple())
except TypeError:
pass
# coding: utf-8
from __future__ import unicode_literals
+import json
import re
from .common import InfoExtractor
ExtractorError,
float_or_none,
int_or_none,
+ strip_or_none,
unified_timestamp,
)
class DPlayIE(InfoExtractor):
+ # shared "/show/episode" path suffix, reused by the DiscoveryPlusIE and
+ # HGTVDeIE subclasses below
+ _PATH_REGEX = r'/(?P<id>[^/]+/[^/?#]+)'
_VALID_URL = r'''(?x)https?://
(?P<domain>
(?:www\.)?(?P<host>d
)
)|
(?P<subdomain_country>es|it)\.dplay\.com
- )/[^/]+/(?P<id>[^/]+/[^/?#]+)'''
+ )/[^/]+''' + _PATH_REGEX
_TESTS = [{
# non geo restricted, via secure api, unsigned download hls URL
'only_matching': True,
}]
+ def _process_errors(self, e, geo_countries):
+ # Translate disco-api error payloads into user-facing errors:
+ # geoblocks raise a geo restriction, missing package / invalid
+ # token means the video needs an authenticated session (cookies).
+ info = self._parse_json(e.cause.read().decode('utf-8'), None)
+ error = info['errors'][0]
+ error_code = error.get('code')
+ if error_code == 'access.denied.geoblocked':
+ self.raise_geo_restricted(countries=geo_countries)
+ elif error_code in ('access.denied.missingpackage', 'invalid.token'):
+ raise ExtractorError(
+ 'This video is only available for registered users. You may want to use --cookies.', expected=True)
+ raise ExtractorError(info['errors'][0]['detail'], expected=True)
+
+ def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
+ # Default auth: fetch an anonymous bearer token for the realm.
+ # Overridden by subclasses (e.g. DiscoveryPlusIE uses a client header).
+ headers['Authorization'] = 'Bearer ' + self._download_json(
+ disco_base + 'token', display_id, 'Downloading token',
+ query={
+ 'realm': realm,
+ })['data']['attributes']['token']
+
+ def _download_video_playback_info(self, disco_base, video_id, headers):
+ # Normalize the API's {format_id: {...}} streaming dict into a list
+ # of {'type', 'url'} dicts so subclasses can return a list directly.
+ streaming = self._download_json(
+ disco_base + 'playback/videoPlaybackInfo/' + video_id,
+ video_id, headers=headers)['data']['attributes']['streaming']
+ streaming_list = []
+ for format_id, format_dict in streaming.items():
+ streaming_list.append({
+ 'type': format_id,
+ 'url': format_dict.get('url'),
+ })
+ return streaming_list
+
+
def _get_disco_api_info(self, url, display_id, disco_host, realm, country):
geo_countries = [country.upper()]
self._initialize_geo_bypass({
'countries': geo_countries,
})
disco_base = 'https://%s/' % disco_host
- token = self._download_json(
- disco_base + 'token', display_id, 'Downloading token',
- query={
- 'realm': realm,
- })['data']['attributes']['token']
headers = {
'Referer': url,
- 'Authorization': 'Bearer ' + token,
}
- video = self._download_json(
- disco_base + 'content/videos/' + display_id, display_id,
- headers=headers, query={
- 'fields[channel]': 'name',
- 'fields[image]': 'height,src,width',
- 'fields[show]': 'name',
- 'fields[tag]': 'name',
- 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration',
- 'include': 'images,primaryChannel,show,tags'
- })
+ # Auth headers are filled in by the overridable hook above
+ self._update_disco_api_headers(headers, disco_base, display_id, realm)
+ try:
+ video = self._download_json(
+ disco_base + 'content/videos/' + display_id, display_id,
+ headers=headers, query={
+ 'fields[channel]': 'name',
+ 'fields[image]': 'height,src,width',
+ 'fields[show]': 'name',
+ 'fields[tag]': 'name',
+ 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration',
+ 'include': 'images,primaryChannel,show,tags'
+ })
+ except ExtractorError as e:
+ # HTTP 400 carries a structured error payload (geoblock etc.)
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+ self._process_errors(e, geo_countries)
+ raise
video_id = video['data']['id']
info = video['data']['attributes']
title = info['name'].strip()
formats = []
try:
- streaming = self._download_json(
- disco_base + 'playback/videoPlaybackInfo/' + video_id,
- display_id, headers=headers)['data']['attributes']['streaming']
+ streaming = self._download_video_playback_info(
+ disco_base, video_id, headers)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- info = self._parse_json(e.cause.read().decode('utf-8'), display_id)
- error = info['errors'][0]
- error_code = error.get('code')
- if error_code == 'access.denied.geoblocked':
- self.raise_geo_restricted(countries=geo_countries)
- elif error_code == 'access.denied.missingpackage':
- self.raise_login_required()
- raise ExtractorError(info['errors'][0]['detail'], expected=True)
+ self._process_errors(e, geo_countries)
raise
- for format_id, format_dict in streaming.items():
+ # streaming is now a list of {'type', 'url'} dicts (see the hook above)
+ for format_dict in streaming:
if not isinstance(format_dict, dict):
continue
format_url = format_dict.get('url')
if not format_url:
continue
+ format_id = format_dict.get('type')
ext = determine_ext(format_url)
if format_id == 'dash' or ext == 'mpd':
formats.extend(self._extract_mpd_formats(
'id': video_id,
'display_id': display_id,
'title': title,
- 'description': info.get('description'),
+ 'description': strip_or_none(info.get('description')),
'duration': float_or_none(info.get('videoDuration'), 1000),
'timestamp': unified_timestamp(info.get('publishStart')),
'series': series,
host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com'
return self._get_disco_api_info(
url, display_id, host, 'dplay' + country, country)
+
+
+class DiscoveryPlusIE(DPlayIE):
+ # discoveryplus.com uses the shared DPlay disco-api flow, but with a
+ # client identification header instead of a bearer token and a v3
+ # POST-based playback endpoint (both hooks overridden below).
+ _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video' + DPlayIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family',
+ 'info_dict': {
+ 'id': '1140794',
+ 'display_id': 'property-brothers-forever-home/food-and-family',
+ 'ext': 'mp4',
+ 'title': 'Food and Family',
+ 'description': 'The brothers help a Richmond family expand their single-level home.',
+ 'duration': 2583.113,
+ 'timestamp': 1609304400,
+ 'upload_date': '20201230',
+ 'creator': 'HGTV',
+ 'series': 'Property Brothers: Forever Home',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'skip': 'Available for Premium users',
+ }]
+
+ def _update_disco_api_headers(self, headers, disco_base, display_id, realm):
+ headers['x-disco-client'] = 'WEB:UNKNOWN:dplus_us:15.0.0'
+
+ def _download_video_playback_info(self, disco_base, video_id, headers):
+ # v3 endpoint takes a JSON POST body; response is already a list
+ return self._download_json(
+ disco_base + 'playback/v3/videoPlaybackInfo',
+ video_id, headers=headers, data=json.dumps({
+ 'deviceInfo': {
+ 'adBlocker': False,
+ },
+ 'videoId': video_id,
+ 'wisteriaProperties': {
+ 'platform': 'desktop',
+ },
+ }).encode('utf-8'))['data']['attributes']['streaming']
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ return self._get_disco_api_info(
+ url, display_id, 'us1-prod-direct.discoveryplus.com', 'go', 'us')
+
+
+class HGTVDeIE(DPlayIE):
+ # de.hgtv.com: plain DPlay disco-api flow with the 'hgtv'/'de'
+ # realm/country; no hooks need overriding.
+ _VALID_URL = r'https?://de\.hgtv\.com/sendungen' + DPlayIE._PATH_REGEX
+ _TESTS = [{
+ 'url': 'https://de.hgtv.com/sendungen/tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette/',
+ 'info_dict': {
+ 'id': '151205',
+ 'display_id': 'tiny-house-klein-aber-oho/wer-braucht-schon-eine-toilette',
+ 'ext': 'mp4',
+ 'title': 'Wer braucht schon eine Toilette',
+ 'description': 'md5:05b40a27e7aed2c9172de34d459134e2',
+ 'duration': 1177.024,
+ 'timestamp': 1595705400,
+ 'upload_date': '20200725',
+ 'creator': 'HGTV',
+ 'series': 'Tiny House - klein, aber oho',
+ 'season_number': 3,
+ 'episode_number': 3,
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ return self._get_disco_api_info(
+ url, display_id, 'eu1-prod.disco-api.com', 'hgtv', 'de')
--- /dev/null
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ unified_strdate,
+ xpath_text,
+ determine_ext,
+ float_or_none,
+ ExtractorError,
+)
+
+
+class DreiSatIE(InfoExtractor):
+ # 3sat mediathek extractor; all metadata comes from an XML details
+ # service (see extract_from_xml_url below).
+ IE_NAME = '3sat'
+ _GEO_COUNTRIES = ['DE']
+ _VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)'
+ _TESTS = [
+ {
+ 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918',
+ 'md5': 'be37228896d30a88f315b638900a026e',
+ 'info_dict': {
+ 'id': '45918',
+ 'ext': 'mp4',
+ 'title': 'Waidmannsheil',
+ 'description': 'md5:cce00ca1d70e21425e72c86a98a56817',
+ 'uploader': 'SCHWEIZWEIT',
+ 'uploader_id': '100000210',
+ 'upload_date': '20140913'
+ },
+ 'params': {
+ 'skip_download': True, # m3u8 downloads
+ }
+ },
+ {
+ 'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066',
+ 'only_matching': True,
+ },
+ ]
+
+ def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+ # Collect paramGroup definitions (host/app/protocols) keyed by their
+ # xml:id so each <video> node can look up its group below.
+ param_groups = {}
+ for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)):
+ group_id = param_group.get(self._xpath_ns(
+ 'id', 'http://www.w3.org/XML/1998/namespace'))
+ params = {}
+ for param in param_group:
+ params[param.get('name')] = param.get('value')
+ param_groups[group_id] = params
+
+ formats = []
+ for video in smil.findall(self._xpath_ns('.//video', namespace)):
+ src = video.get('src')
+ if not src:
+ continue
+ # NOTE(review): bitrate may be None when neither the _<n>k URL
+ # pattern nor a system-bitrate attribute is present, which would
+ # make the '%s-%d' format_id below fail — confirm upstream data
+ bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+ group_id = video.get('paramGroup')
+ param_group = param_groups[group_id]
+ for proto in param_group['protocols'].split(','):
+ formats.append({
+ 'url': '%s://%s' % (proto, param_group['host']),
+ 'app': param_group['app'],
+ 'play_path': src,
+ 'ext': 'flv',
+ 'format_id': '%s-%d' % (proto, bitrate),
+ 'tbr': bitrate,
+ })
+ self._sort_formats(formats)
+ return formats
+
+ def extract_from_xml_url(self, video_id, xml_url):
+ # Download and parse the beitragsDetails XML document for video_id.
+ doc = self._download_xml(
+ xml_url, video_id,
+ note='Downloading video info',
+ errnote='Failed to download video info')
+
+ status_code = xpath_text(doc, './status/statuscode')
+ if status_code and status_code != 'ok':
+ if status_code == 'notVisibleAnymore':
+ message = 'Video %s is not available' % video_id
+ else:
+ message = '%s returned error: %s' % (self.IE_NAME, status_code)
+ raise ExtractorError(message, expected=True)
+
+ title = xpath_text(doc, './/information/title', 'title', True)
+
+ urls = []
+ formats = []
+ for fnode in doc.findall('.//formitaeten/formitaet'):
+ video_url = xpath_text(fnode, 'url')
+ if not video_url or video_url in urls:
+ continue
+ urls.append(video_url)
+
+ # skip placeholder/geo-fenced entries
+ is_available = 'http://www.metafilegenerator' not in video_url
+ geoloced = 'static_geoloced_online' in video_url
+ if not is_available or geoloced:
+ continue
+
+ # basetype encodes vcodec_acodec_container_proto_index_indexproto
+ format_id = fnode.attrib['basetype']
+ format_m = re.match(r'''(?x)
+ (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_
+ (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+)
+ ''', format_id)
+
+ ext = determine_ext(video_url, None) or format_m.group('container')
+
+ if ext == 'meta':
+ continue
+ elif ext == 'smil':
+ formats.extend(self._extract_smil_formats(
+ video_url, video_id, fatal=False))
+ elif ext == 'm3u8':
+ # the certificates are misconfigured (see
+ # https://github.com/ytdl-org/youtube-dl/issues/8665)
+ if video_url.startswith('https://'):
+ continue
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id=format_id, fatal=False))
+ elif ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ video_url, video_id, f4m_id=format_id, fatal=False))
+ else:
+ quality = xpath_text(fnode, './quality')
+ if quality:
+ format_id += '-' + quality
+
+ abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000)
+ vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000)
+
+ # derive missing audio bitrate from total and video bitrates
+ tbr = int_or_none(self._search_regex(
+ r'_(\d+)k', video_url, 'bitrate', None))
+ if tbr and vbr and not abr:
+ abr = tbr - vbr
+
+ formats.append({
+ 'format_id': format_id,
+ 'url': video_url,
+ 'ext': ext,
+ 'acodec': format_m.group('acodec'),
+ 'vcodec': format_m.group('vcodec'),
+ 'abr': abr,
+ 'vbr': vbr,
+ 'tbr': tbr,
+ 'width': int_or_none(xpath_text(fnode, './width')),
+ 'height': int_or_none(xpath_text(fnode, './height')),
+ 'filesize': int_or_none(xpath_text(fnode, './filesize')),
+ 'protocol': format_m.group('proto').lower(),
+ })
+
+ # no usable formats + a geolocation flag means geo restriction
+ geolocation = xpath_text(doc, './/details/geolocation')
+ if not formats and geolocation and geolocation != 'none':
+ self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+
+ self._sort_formats(formats)
+
+ thumbnails = []
+ for node in doc.findall('.//teaserimages/teaserimage'):
+ thumbnail_url = node.text
+ if not thumbnail_url:
+ continue
+ thumbnail = {
+ 'url': thumbnail_url,
+ }
+ # key attribute is a "WxH" size string when present
+ thumbnail_key = node.get('key')
+ if thumbnail_key:
+ m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key)
+ if m:
+ thumbnail['width'] = int(m.group(1))
+ thumbnail['height'] = int(m.group(2))
+ thumbnails.append(thumbnail)
+
+ upload_date = unified_strdate(xpath_text(doc, './/details/airtime'))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': xpath_text(doc, './/information/detail'),
+ 'duration': int_or_none(xpath_text(doc, './/details/lengthSec')),
+ 'thumbnails': thumbnails,
+ 'uploader': xpath_text(doc, './/details/originChannelTitle'),
+ 'uploader_id': xpath_text(doc, './/details/originChannelId'),
+ 'upload_date': upload_date,
+ 'formats': formats,
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id
+ return self.extract_from_xml_url(video_id, details_url)
CanvasIE,
CanvasEenIE,
VrtNUIE,
+ DagelijkseKostIE,
)
from .carambatv import (
CarambaTVIE,
DouyuShowIE,
DouyuTVIE,
)
-from .dplay import DPlayIE
+from .dplay import (
+ DPlayIE,
+ DiscoveryPlusIE,
+ HGTVDeIE,
+)
+from .dreisat import DreiSatIE
from .drbonanza import DRBonanzaIE
from .drtuber import DrTuberIE
from .drtv import (
VivoIE,
)
from .showroomlive import ShowRoomLiveIE
+from .simplecast import (
+ SimplecastIE,
+ SimplecastEpisodeIE,
+ SimplecastPodcastIE,
+)
from .sina import SinaIE
from .sixplay import SixPlayIE
from .skyit import (
BellatorIE,
ParamountNetworkIE,
)
-from .storyfire import (
- StoryFireIE,
- StoryFireUserIE,
- StoryFireSeriesIE,
-)
from .stitcher import StitcherIE
from .sport5 import Sport5IE
from .sportbox import SportBoxIE
from .srmediathek import SRMediathekIE
from .stanfordoc import StanfordOpenClassroomIE
from .steam import SteamIE
+from .storyfire import (
+ StoryFireIE,
+ StoryFireUserIE,
+ StoryFireSeriesIE,
+)
from .streamable import StreamableIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
ZattooLiveIE,
)
from .zdf import ZDFIE, ZDFChannelIE
+from .zhihu import ZhihuIE
from .zingmp3 import ZingMp3IE
from .zoom import ZoomIE
from .zype import ZypeIE
from .rumble import RumbleEmbedIE
from .arcpublishing import ArcPublishingIE
from .medialaan import MedialaanIE
+from .simplecast import SimplecastIE
class GenericIE(InfoExtractor):
'duration': 159,
},
},
+ {
+ # Simplecast player embed
+ 'url': 'https://www.bio.org/podcast',
+ 'info_dict': {
+ 'id': 'podcast',
+ 'title': 'I AM BIO Podcast | BIO',
+ },
+ 'playlist_mincount': 52,
+ },
]
def report_following_redirect(self, new_url):
return self.playlist_from_matches(
matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie')
+ # Look for Simplecast embeds
+ # (each match is a player/embed iframe URL handled by SimplecastIE)
+ simplecast_urls = SimplecastIE._extract_urls(webpage)
+ if simplecast_urls:
+ return self.playlist_from_matches(
+ simplecast_urls, video_id, video_title)
+
# Look for BBC iPlayer embed
matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
if matches:
from .common import InfoExtractor
from ..utils import (
- determine_ext,
ExtractorError,
+ determine_ext,
int_or_none,
try_get,
+ unescapeHTML,
url_or_none,
)
IE_NAME = '9gag'
_VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P<id>[^/?&#]+)'
- _TEST = {
+ _TESTS = [{
'url': 'https://9gag.com/gag/ae5Ag7B',
'info_dict': {
'id': 'ae5Ag7B',
'dislike_count': int,
'comment_count': int,
}
- }
+ }, {
+ # HTML escaped title
+ 'url': 'https://9gag.com/gag/av5nvyb',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
post_id = self._match_id(url)
'The given url does not contain a video',
expected=True)
- title = post['title']
+ # post titles may contain HTML entities (see the av5nvyb test case)
+ title = unescapeHTML(post['title'])
duration = None
formats = []
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_podcast_url,
+ int_or_none,
+ parse_iso8601,
+ strip_or_none,
+ try_get,
+ urlencode_postdata,
+)
+
+
+class SimplecastBaseIE(InfoExtractor):
+ # Shared helpers for the Simplecast extractors: plain GET API calls,
+ # the URL-based search endpoints, and episode-JSON parsing.
+ _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}'
+ _API_BASE = 'https://api.simplecast.com/'
+
+ def _call_api(self, path_tmpl, video_id):
+ # GET an API resource; path_tmpl has one %s slot for video_id
+ return self._download_json(
+ self._API_BASE + path_tmpl % video_id, video_id)
+
+ def _call_search_api(self, resource, resource_id, resource_url):
+ # Resolve a public page URL to its API object via the search endpoint
+ return self._download_json(
+ 'https://api.simplecast.com/%ss/search' % resource, resource_id,
+ data=urlencode_postdata({'url': resource_url}))
+
+ def _parse_episode(self, episode):
+ # Map an episode API object to an info dict; the audio URL is
+ # required (KeyError on enclosure_url if every source is missing).
+ episode_id = episode['id']
+ title = episode['title'].strip()
+ audio_file = episode.get('audio_file') or {}
+ audio_file_url = audio_file.get('url') or episode.get('audio_file_url') or episode['enclosure_url']
+
+ season = episode.get('season') or {}
+ season_href = season.get('href')
+ season_id = None
+ if season_href:
+ season_id = self._search_regex(
+ r'https?://api.simplecast.com/seasons/(%s)' % self._UUID_REGEX,
+ season_href, 'season id', default=None)
+
+ webpage_url = episode.get('episode_url')
+ channel_url = None
+ if webpage_url:
+ channel_url = self._search_regex(
+ r'(https?://[^/]+\.simplecast\.com)',
+ webpage_url, 'channel url', default=None)
+
+ return {
+ 'id': episode_id,
+ 'display_id': episode.get('slug'),
+ 'title': title,
+ 'url': clean_podcast_url(audio_file_url),
+ 'webpage_url': webpage_url,
+ 'channel_url': channel_url,
+ 'series': try_get(episode, lambda x: x['podcast']['title']),
+ 'season_number': int_or_none(season.get('number')),
+ 'season_id': season_id,
+ 'thumbnail': episode.get('image_url'),
+ 'episode_id': episode_id,
+ 'episode_number': int_or_none(episode.get('number')),
+ 'description': strip_or_none(episode.get('description')),
+ 'timestamp': parse_iso8601(episode.get('published_at')),
+ 'duration': int_or_none(episode.get('duration')),
+ 'filesize': int_or_none(audio_file.get('size') or episode.get('audio_file_size')),
+ }
+
+
+
+
+class SimplecastIE(SimplecastBaseIE):
+ # Handles direct API episode URLs and player.simplecast.com URLs,
+ # both of which carry the episode UUID.
+ IE_NAME = 'simplecast'
+ _VALID_URL = r'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P<id>%s)' % SimplecastBaseIE._UUID_REGEX
+ _COMMON_TEST_INFO = {
+ 'display_id': 'errant-signal-chris-franklin-new-wave-video-essays',
+ 'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
+ 'ext': 'mp3',
+ 'title': 'Errant Signal - Chris Franklin & New Wave Video Essays',
+ 'episode_number': 1,
+ 'episode_id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876',
+ 'description': 'md5:34752789d3d2702e2d2c975fbd14f357',
+ 'season_number': 1,
+ 'season_id': 'e23df0da-bae4-4531-8bbf-71364a88dc13',
+ 'series': 'The RE:BIND.io Podcast',
+ 'duration': 5343,
+ 'timestamp': 1580979475,
+ 'upload_date': '20200206',
+ 'webpage_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com/episodes/errant-signal-chris-franklin-new-wave-video-essays',
+ 'channel_url': r're:^https?://the-re-bind-io-podcast\.simplecast\.com$',
+ }
+ _TESTS = [{
+ 'url': 'https://api.simplecast.com/episodes/b6dc49a2-9404-4853-9aa9-9cfc097be876',
+ 'md5': '8c93be7be54251bf29ee97464eabe61c',
+ 'info_dict': _COMMON_TEST_INFO,
+ }, {
+ 'url': 'https://player.simplecast.com/b6dc49a2-9404-4853-9aa9-9cfc097be876',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ # Matches iframes embedding embed.simplecast.com/<8-hex> or
+ # player.simplecast.com/<uuid>; used by GenericIE.
+ return re.findall(
+ r'''(?x)<iframe[^>]+src=["\']
+ (
+ https?://(?:embed\.simplecast\.com/[0-9a-f]{8}|
+ player\.simplecast\.com/%s
+ ))''' % SimplecastBaseIE._UUID_REGEX, webpage)
+
+ def _real_extract(self, url):
+ episode_id = self._match_id(url)
+ episode = self._call_api('episodes/%s', episode_id)
+ return self._parse_episode(episode)
+
+
+class SimplecastEpisodeIE(SimplecastBaseIE):
+ # Handles slug-based episode URLs on podcast subdomains; the search
+ # API resolves the page URL to the episode object.
+ IE_NAME = 'simplecast:episode'
+ _VALID_URL = r'https?://(?!api\.)[^/]+\.simplecast\.com/episodes/(?P<id>[^/?&#]+)'
+ _TEST = {
+ 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes/errant-signal-chris-franklin-new-wave-video-essays',
+ 'md5': '8c93be7be54251bf29ee97464eabe61c',
+ 'info_dict': SimplecastIE._COMMON_TEST_INFO,
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ episode = self._call_search_api(
+ 'episode', mobj.group(1), mobj.group(0))
+ return self._parse_episode(episode)
+
+
+class SimplecastPodcastIE(SimplecastBaseIE):
+ # Whole-podcast playlists from a show subdomain; the negative
+ # lookaheads exclude reserved subdomains and per-episode pages.
+ IE_NAME = 'simplecast:podcast'
+ _VALID_URL = r'https?://(?!(?:api|cdn|embed|feeds|player)\.)(?P<id>[^/]+)\.simplecast\.com(?!/episodes/[^/?&#]+)'
+ _TESTS = [{
+ 'url': 'https://the-re-bind-io-podcast.simplecast.com',
+ 'playlist_mincount': 33,
+ 'info_dict': {
+ 'id': '07d28d26-7522-42eb-8c53-2bdcfc81c43c',
+ 'title': 'The RE:BIND.io Podcast',
+ },
+ }, {
+ 'url': 'https://the-re-bind-io-podcast.simplecast.com/episodes',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ subdomain = self._match_id(url)
+ site = self._call_search_api('site', subdomain, url)
+ podcast = site['podcast']
+ podcast_id = podcast['id']
+ podcast_title = podcast.get('title')
+
+ def entries():
+ # lazy generator: episodes are fetched when the playlist is consumed
+ episodes = self._call_api('podcasts/%s/episodes', podcast_id)
+ for episode in (episodes.get('collection') or []):
+ info = self._parse_episode(episode)
+ info['series'] = podcast_title
+ yield info
+
+ return self.playlist_result(entries(), podcast_id, podcast_title)
# coding: utf-8
from __future__ import unicode_literals
-import itertools
+import functools
+
from .common import InfoExtractor
+from ..utils import (
+ # HEADRequest,
+ int_or_none,
+ OnDemandPagedList,
+ smuggle_url,
+)
+
+
+class StoryFireBaseIE(InfoExtractor):
+ # Shared API helper and video-JSON parsing; playback is delegated to
+ # Vimeo via a url_transparent result (see _parse_video).
+ _VALID_URL_BASE = r'https?://(?:www\.)?storyfire\.com/'
+
+ def _call_api(self, path, video_id, resource, query=None):
+ return self._download_json(
+ 'https://storyfire.com/app/%s/%s' % (path, video_id), video_id,
+ 'Downloading %s JSON metadata' % resource, query=query)
+
+ def _parse_video(self, video):
+ title = video['title']
+ # the Vimeo numeric id is embedded in the external player URL
+ vimeo_id = self._search_regex(
+ r'https?://player\.vimeo\.com/external/(\d+)',
+ video['vimeoVideoURL'], 'vimeo id')
+
+ # video_url = self._request_webpage(
+ # HEADRequest(video['vimeoVideoURL']), video_id).geturl()
+ # formats = []
+ # for v_url, suffix in [(video_url, '_sep'), (video_url.replace('/sep/video/', '/video/'), '')]:
+ # formats.extend(self._extract_m3u8_formats(
+ # v_url, video_id, 'mp4', 'm3u8_native',
+ # m3u8_id='hls' + suffix, fatal=False))
+ # formats.extend(self._extract_mpd_formats(
+ # v_url.replace('.m3u8', '.mpd'), video_id,
+ # mpd_id='dash' + suffix, fatal=False))
+ # self._sort_formats(formats)
+ uploader_id = video.get('hostID')
-class StoryFireIE(InfoExtractor):
- _VALID_URL = r'(?:(?:https?://(?:www\.)?storyfire\.com/video-details)|(?:https://storyfire.app.link))/(?P<id>[^/\s]+)'
- _TESTS = [{
+ return {
+ '_type': 'url_transparent',
+ 'id': vimeo_id,
+ 'title': title,
+ 'description': video.get('description'),
+ # the Referer header is smuggled so the Vimeo IE sends it when
+ # fetching the embed
+ 'url': smuggle_url(
+ 'https://player.vimeo.com/video/' + vimeo_id, {
+ 'http_headers': {
+ 'Referer': 'https://storyfire.com/',
+ }
+ }),
+ # 'formats': formats,
+ 'thumbnail': video.get('storyImage'),
+ 'view_count': int_or_none(video.get('views')),
+ 'like_count': int_or_none(video.get('likesCount')),
+ 'comment_count': int_or_none(video.get('commentsCount')),
+ 'duration': int_or_none(video.get('videoDuration')),
+ 'timestamp': int_or_none(video.get('publishDate')),
+ 'uploader': video.get('username'),
+ 'uploader_id': uploader_id,
+ 'uploader_url': 'https://storyfire.com/user/%s/video' % uploader_id if uploader_id else None,
+ 'episode_number': int_or_none(video.get('episodeNumber') or video.get('episode_number')),
+ }
+
+
+
+
+class StoryFireIE(StoryFireBaseIE):
+ # Video detail pages; metadata now comes from the JSON API instead of
+ # scraping the __NEXT_DATA__ blob from the webpage.
+ _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'video-details/(?P<id>[0-9a-f]{24})'
+ _TEST = {
'url': 'https://storyfire.com/video-details/5df1d132b6378700117f9181',
- 'md5': '560953bfca81a69003cfa5e53ac8a920',
+ 'md5': 'caec54b9e4621186d6079c7ec100c1eb',
'info_dict': {
- 'id': '5df1d132b6378700117f9181',
+ 'id': '378954662',
'ext': 'mp4',
'title': 'Buzzfeed Teaches You About Memes',
'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
'timestamp': 1576129028,
- 'description': 'Mocking Buzzfeed\'s meme lesson. Reuploaded from YouTube because of their new policies',
+ 'description': 'md5:0b4e28021548e144bed69bb7539e62ea',
'uploader': 'whang!',
'upload_date': '20191212',
+ 'duration': 418,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
},
- 'params': {'format': 'bestvideo'} # There are no merged formats in the playlist.
- }, {
- 'url': 'https://storyfire.app.link/5GxAvWOQr8', # Alternate URL format, with unrelated short ID
- 'md5': '7a2dc6d60c4889edfed459c620fe690d',
- 'info_dict': {
- 'id': '5f1e11ecd78a57b6c702001d',
- 'ext': 'm4a',
- 'title': 'Weird Nintendo Prototype Leaks',
- 'description': 'A stream taking a look at some weird Nintendo Prototypes with Luigi in Mario 64 and weird Yoshis',
- 'timestamp': 1595808576,
- 'upload_date': '20200727',
- 'uploader': 'whang!',
- 'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
+ 'params': {
+ 'skip_download': True,
},
- 'params': {'format': 'bestaudio'} # Verifying audio extraction
-
- }]
-
- _aformats = {
- 'audio-medium-audio': {'acodec': 'aac', 'abr': 125, 'preference': -10},
- 'audio-high-audio': {'acodec': 'aac', 'abr': 254, 'preference': -1},
+ 'expected_warnings': ['Unable to download JSON metadata']
}
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- # Extracting the json blob is mandatory to proceed with extraction.
- jsontext = self._html_search_regex(
- r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>',
- webpage, 'json_data')
-
- json = self._parse_json(jsontext, video_id)
-
- # The currentVideo field in the json is mandatory
- # because it contains the only link to the m3u playlist
- video = json['props']['initialState']['video']['currentVideo']
- videourl = video['vimeoVideoURL'] # Video URL is mandatory
-
- # Extract other fields from the json in an error tolerant fashion
- # ID may be incorrect (on short URL format), correct it.
- parsed_id = video.get('_id')
- if parsed_id:
- video_id = parsed_id
+ video = self._call_api(
+ 'generic/video-detail', video_id, 'video')['video']
+ return self._parse_video(video)
- title = video.get('title')
- description = video.get('description')
- thumbnail = video.get('storyImage')
- views = video.get('views')
- likes = video.get('likesCount')
- comments = video.get('commentsCount')
- duration = video.get('videoDuration')
- publishdate = video.get('publishDate') # Apparently epoch time, day only
-
- uploader = video.get('username')
- uploader_id = video.get('hostID')
- # Construct an uploader URL
- uploader_url = None
- if uploader_id:
- uploader_url = "https://storyfire.com/user/%s/video" % uploader_id
-
- # Collect root playlist to determine formats
- formats = self._extract_m3u8_formats(
- videourl, video_id, 'mp4', 'm3u8_native')
-
- # Modify formats to fill in missing information about audio codecs
- for format in formats:
- aformat = self._aformats.get(format['format_id'])
- if aformat:
- format['acodec'] = aformat['acodec']
- format['abr'] = aformat['abr']
- format['quality'] = aformat['preference']
- format['ext'] = 'm4a'
-
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'ext': "mp4",
- 'url': videourl,
- 'formats': formats,
-
- 'thumbnail': thumbnail,
- 'view_count': views,
- 'like_count': likes,
- 'comment_count': comments,
- 'duration': duration,
- 'timestamp': publishdate,
-
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'uploader_url': uploader_url,
-
- }
-
-
-class StoryFireUserIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?storyfire\.com/user/(?P<id>[^/\s]+)/video'
- _TESTS = [{
- 'url': 'https://storyfire.com/user/ntZAJFECERSgqHSxzonV5K2E89s1/video',
- 'info_dict': {
- 'id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
- 'title': 'whang!',
- },
- 'playlist_mincount': 18
- }, {
+class StoryFireUserIE(StoryFireBaseIE):
+ _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'user/(?P<id>[^/]+)/video'
+ _TEST = {
'url': 'https://storyfire.com/user/UQ986nFxmAWIgnkZQ0ftVhq4nOk2/video',
'info_dict': {
'id': 'UQ986nFxmAWIgnkZQ0ftVhq4nOk2',
- 'title': 'McJuggerNuggets',
},
- 'playlist_mincount': 143
-
- }]
+ 'playlist_mincount': 151,
+ }
+ _PAGE_SIZE = 20
- # Generator for fetching playlist items
- def _enum_videos(self, baseurl, user_id, firstjson):
- totalVideos = int(firstjson['videosCount'])
- haveVideos = 0
- json = firstjson
-
- for page in itertools.count(1):
- for video in json['videos']:
- id = video['_id']
- url = "https://storyfire.com/video-details/%s" % id
- haveVideos += 1
- yield {
- '_type': 'url',
- 'id': id,
- 'url': url,
- 'ie_key': 'StoryFire',
-
- 'title': video.get('title'),
- 'description': video.get('description'),
- 'view_count': video.get('views'),
- 'comment_count': video.get('commentsCount'),
- 'duration': video.get('videoDuration'),
- 'timestamp': video.get('publishDate'),
- }
- # Are there more pages we could fetch?
- if haveVideos < totalVideos:
- pageurl = baseurl + ("%i" % haveVideos)
- json = self._download_json(pageurl, user_id,
- note='Downloading page %s' % page)
-
- # Are there any videos in the new json?
- videos = json.get('videos')
- if not videos or len(videos) == 0:
- break # no videos
-
- else:
- break # We have fetched all the videos, stop
+ def _fetch_page(self, user_id, page):
+ videos = self._call_api(
+ 'publicVideos', user_id, 'page %d' % (page + 1), {
+ 'skip': page * self._PAGE_SIZE,
+ })['videos']
+ for video in videos:
+ yield self._parse_video(video)
def _real_extract(self, url):
user_id = self._match_id(url)
+ entries = OnDemandPagedList(functools.partial(
+ self._fetch_page, user_id), self._PAGE_SIZE)
+ return self.playlist_result(entries, user_id)
- baseurl = "https://storyfire.com/app/publicVideos/%s?skip=" % user_id
-
- # Download first page to ensure it can be downloaded, and get user information if available.
- firstpage = baseurl + "0"
- firstjson = self._download_json(firstpage, user_id)
-
- title = None
- videos = firstjson.get('videos')
- if videos and len(videos):
- title = videos[1].get('username')
- return {
- '_type': 'playlist',
- 'entries': self._enum_videos(baseurl, user_id, firstjson),
- 'id': user_id,
- 'title': title,
- }
-
-
-class StoryFireSeriesIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?storyfire\.com/write/series/stories/(?P<id>[^/\s]+)'
+class StoryFireSeriesIE(StoryFireBaseIE):
+ _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'write/series/stories/(?P<id>[^/?&#]+)'
_TESTS = [{
'url': 'https://storyfire.com/write/series/stories/-Lq6MsuIHLODO6d2dDkr/',
'info_dict': {
'id': '-Lq6MsuIHLODO6d2dDkr',
},
- 'playlist_mincount': 13
+ 'playlist_mincount': 13,
}, {
'url': 'https://storyfire.com/write/series/stories/the_mortal_one/',
'info_dict': {
'id': 'the_mortal_one',
},
- 'playlist_count': 0 # This playlist has entries, but no videos.
- }, {
- 'url': 'https://storyfire.com/write/series/stories/story_time',
- 'info_dict': {
- 'id': 'story_time',
- },
- 'playlist_mincount': 10
+ 'playlist_count': 0,
}]
- # Generator for returning playlist items
- # This object is substantially different than the one in the user videos page above
- def _enum_videos(self, jsonlist):
- for video in jsonlist:
- id = video['_id']
- if video.get('hasVideo'): # Boolean element
- url = "https://storyfire.com/video-details/%s" % id
- yield {
- '_type': 'url',
- 'id': id,
- 'url': url,
- 'ie_key': 'StoryFire',
-
- 'title': video.get('title'),
- 'description': video.get('description'),
- 'view_count': video.get('views'),
- 'likes_count': video.get('likesCount'),
- 'comment_count': video.get('commentsCount'),
- 'duration': video.get('videoDuration'),
- 'timestamp': video.get('publishDate'),
- }
+ def _extract_videos(self, stories):
+ for story in stories.values():
+ if story.get('hasVideo'):
+ yield self._parse_video(story)
def _real_extract(self, url):
- list_id = self._match_id(url)
-
- listurl = "https://storyfire.com/app/seriesStories/%s/list" % list_id
- json = self._download_json(listurl, list_id)
-
- return {
- '_type': 'playlist',
- 'entries': self._enum_videos(json),
- 'id': list_id
- }
+ series_id = self._match_id(url)
+ stories = self._call_api(
+ 'seriesStories', series_id, 'series stories')
+ return self.playlist_result(self._extract_videos(stories), series_id)
import re
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
determine_ext,
float_or_none,
+ int_or_none,
parse_age_limit,
qualities,
random_birthday,
- try_get,
unified_timestamp,
urljoin,
)
class VideoPressIE(InfoExtractor):
- _VALID_URL = r'https?://videopress\.com/embed/(?P<id>[\da-zA-Z]+)'
+ _ID_REGEX = r'[\da-zA-Z]{8}'
+ _PATH_REGEX = r'video(?:\.word)?press\.com/embed/'
+ _VALID_URL = r'https?://%s(?P<id>%s)' % (_PATH_REGEX, _ID_REGEX)
_TESTS = [{
'url': 'https://videopress.com/embed/kUJmAcSf',
'md5': '706956a6c875873d51010921310e4bc6',
# 17+, requires birth_* params
'url': 'https://videopress.com/embed/iH3gstfZ',
'only_matching': True,
+ }, {
+ 'url': 'https://video.wordpress.com/embed/kUJmAcSf',
+ 'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage):
return re.findall(
- r'<iframe[^>]+src=["\']((?:https?://)?videopress\.com/embed/[\da-zA-Z]+)',
+ r'<iframe[^>]+src=["\']((?:https?://)?%s%s)' % (VideoPressIE._PATH_REGEX, VideoPressIE._ID_REGEX),
webpage)
def _real_extract(self, url):
video_id = self._match_id(url)
query = random_birthday('birth_year', 'birth_month', 'birth_day')
+ query['fields'] = 'description,duration,file_url_base,files,height,original,poster,rating,title,upload_date,width'
video = self._download_json(
'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id,
video_id, query=query)
title = video['title']
- def base_url(scheme):
- return try_get(
- video, lambda x: x['file_url_base'][scheme], compat_str)
-
- base_url = base_url('https') or base_url('http')
+ file_url_base = video.get('file_url_base') or {}
+ base_url = file_url_base.get('https') or file_url_base.get('http')
QUALITIES = ('std', 'dvd', 'hd')
quality = qualities(QUALITIES)
formats = []
- for format_id, f in video['files'].items():
+ for format_id, f in (video.get('files') or {}).items():
if not isinstance(f, dict):
continue
for ext, path in f.items():
'ext': determine_ext(path, ext),
'quality': quality(format_id),
})
- original_url = try_get(video, lambda x: x['original'], compat_str)
+ original_url = video.get('original')
if original_url:
formats.append({
'url': original_url,
'format_id': 'original',
'quality': len(QUALITIES),
+ 'width': int_or_none(video.get('width')),
+ 'height': int_or_none(video.get('height')),
})
self._sort_formats(formats)
parse_iso8601,
sanitized_Request,
std_headers,
+ try_get,
)
_ERRORS = {
'geo': 'Sorry, this content is not available in your region.',
'upcoming': 'Sorry, this content is not yet available.',
- # 'paywall': 'paywall',
+ 'paywall': 'Sorry, this content is only available to Viki Pass Plus subscribers',
}
def _prepare_call(self, path, timestamp=None, post_data=None):
expected=True)
def _check_errors(self, data):
- for reason, status in data.get('blocking', {}).items():
+ for reason, status in (data.get('blocking') or {}).items():
if status and reason in self._ERRORS:
message = self._ERRORS[reason]
if reason == 'geo':
self.raise_geo_restricted(msg=message)
+ elif reason == 'paywall':
+ self.raise_login_required(message)
raise ExtractorError('%s said: %s' % (
self.IE_NAME, message), expected=True)
'info_dict': {
'id': '1023585v',
'ext': 'mp4',
- 'title': 'Heirs Episode 14',
- 'uploader': 'SBS',
- 'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e',
+ 'title': 'Heirs - Episode 14',
+ 'uploader': 'SBS Contents Hub',
+ 'timestamp': 1385047627,
'upload_date': '20131121',
'age_limit': 13,
+ 'duration': 3570,
+ 'episode_number': 14,
+ },
+ 'params': {
+ 'format': 'bestvideo',
},
'skip': 'Blocked in the US',
+ 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, {
# clip
'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference',
'uploader': 'Arirang TV',
'like_count': int,
'age_limit': 0,
- }
+ },
+ 'skip': 'Sorry. There was an error loading this video',
}, {
'url': 'http://www.viki.com/videos/1048879v-ankhon-dekhi',
'info_dict': {
}, {
# episode
'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
- 'md5': '94e0e34fd58f169f40c184f232356cfe',
+ 'md5': '0a53dc252e6e690feccd756861495a8c',
'info_dict': {
'id': '44699v',
'ext': 'mp4',
'uploader': 'group8',
'like_count': int,
'age_limit': 13,
+ 'episode_number': 1,
+ },
+ 'params': {
+ 'format': 'bestvideo',
},
'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, {
}, {
# non-English description
'url': 'http://www.viki.com/videos/158036v-love-in-magic',
- 'md5': 'adf9e321a0ae5d0aace349efaaff7691',
+ 'md5': '41faaba0de90483fb4848952af7c7d0d',
'info_dict': {
'id': '158036v',
'ext': 'mp4',
'title': 'Love In Magic',
'age_limit': 13,
},
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}]
def _real_extract(self, url):
self._check_errors(video)
title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False)
+ episode_number = int_or_none(video.get('number'))
if not title:
- title = 'Episode %d' % video.get('number') if video.get('type') == 'episode' else video.get('id') or video_id
- container_titles = video.get('container', {}).get('titles', {})
+ title = 'Episode %d' % episode_number if video.get('type') == 'episode' else video.get('id') or video_id
+ container_titles = try_get(video, lambda x: x['container']['titles'], dict) or {}
container_title = self.dict_selection(container_titles, 'en')
title = '%s - %s' % (container_title, title)
description = self.dict_selection(video.get('descriptions', {}), 'en')
- duration = int_or_none(video.get('duration'))
- timestamp = parse_iso8601(video.get('created_at'))
- uploader = video.get('author')
- like_count = int_or_none(video.get('likes', {}).get('count'))
- age_limit = parse_age_limit(video.get('rating'))
+ like_count = int_or_none(try_get(video, lambda x: x['likes']['count']))
thumbnails = []
- for thumbnail_id, thumbnail in video.get('images', {}).items():
+ for thumbnail_id, thumbnail in (video.get('images') or {}).items():
thumbnails.append({
'id': thumbnail_id,
'url': thumbnail.get('url'),
}]
except AttributeError:
# fall-back to the old way if there isn't a streamSubtitles attribute
- for subtitle_lang, _ in video.get('subtitle_completions', {}).items():
+ for subtitle_lang, _ in (video.get('subtitle_completions') or {}).items():
subtitles[subtitle_lang] = [{
'ext': subtitles_format,
'url': self._prepare_call(
'id': video_id,
'title': title,
'description': description,
- 'duration': duration,
- 'timestamp': timestamp,
- 'uploader': uploader,
+ 'duration': int_or_none(video.get('duration')),
+ 'timestamp': parse_iso8601(video.get('created_at')),
+ 'uploader': video.get('author'),
+ 'uploader_url': video.get('author_url'),
'like_count': like_count,
- 'age_limit': age_limit,
+ 'age_limit': parse_age_limit(video.get('rating')),
'thumbnails': thumbnails,
'subtitles': subtitles,
+ 'episode_number': episode_number,
}
formats = []
'info_dict': {
'id': '50c',
'title': 'Boys Over Flowers',
- 'description': 'md5:ecd3cff47967fe193cff37c0bec52790',
+ 'description': 'md5:804ce6e7837e1fd527ad2f25420f4d59',
},
'playlist_mincount': 71,
}, {
'description': 'md5:05bf5471385aa8b21c18ad450e350525',
},
'playlist_count': 127,
+ 'skip': 'Page not found',
}, {
'url': 'http://www.viki.com/news/24569c-showbiz-korea',
'only_matching': True,
'is_live': is_live,
}
- def _extract_original_format(self, url, video_id):
+ def _extract_original_format(self, url, video_id, unlisted_hash=None):
+ query = {'action': 'load_download_config'}
+ if unlisted_hash:
+ query['unlisted_hash'] = unlisted_hash
download_data = self._download_json(
- url, video_id, fatal=False,
- query={'action': 'load_download_config'},
+ url, video_id, fatal=False, query=query,
headers={'X-Requested-With': 'XMLHttpRequest'})
if download_data:
source_file = download_data.get('source_file')
{
'url': 'https://vimeo.com/160743502/abd0e13fb4',
'only_matching': True,
+ },
+ {
+ # requires passing unlisted_hash(a52724358e) to load_download_config request
+ 'url': 'https://vimeo.com/392479337/a52724358e',
+ 'only_matching': True,
}
# https://gettingthingsdone.com/workflowmap/
# vimeo embed with check-password page protected by Referer header
if config.get('view') == 4:
config = self._verify_player_video_password(redirect_url, video_id, headers)
- vod = config.get('video', {}).get('vod', {})
+ video = config.get('video') or {}
+ vod = video.get('vod') or {}
def is_rented():
if '>You rented this title.<' in webpage:
formats = []
source_format = self._extract_original_format(
- 'https://vimeo.com/' + video_id, video_id)
+ 'https://vimeo.com/' + video_id, video_id, video.get('unlisted_hash'))
if source_format:
formats.append(source_format)
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
+from ..compat import (
+ compat_parse_qs,
+ compat_urllib_parse_urlparse,
+)
from ..utils import (
int_or_none,
+ month_by_abbreviation,
parse_filesize,
- unified_strdate,
)
class XboxClipsIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?xboxclips\.com/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\w-]{36})'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?(?:xboxclips\.com|gameclips\.io)/(?:video\.php\?.*vid=|[^/]+/)(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})'
+ _TESTS = [{
'url': 'http://xboxclips.com/video.php?uid=2533274823424419&gamertag=Iabdulelah&vid=074a69a9-5faf-46aa-b93b-9909c1720325',
'md5': 'fbe1ec805e920aeb8eced3c3e657df5d',
'info_dict': {
'id': '074a69a9-5faf-46aa-b93b-9909c1720325',
'ext': 'mp4',
- 'title': 'Iabdulelah playing Titanfall',
+ 'title': 'iAbdulElah playing Titanfall',
'filesize_approx': 26800000,
'upload_date': '20140807',
'duration': 56,
}
- }
+ }, {
+ 'url': 'https://gameclips.io/iAbdulElah/074a69a9-5faf-46aa-b93b-9909c1720325',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
+ if '/video.php' in url:
+ qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ url = 'https://gameclips.io/%s/%s' % (qs['gamertag'][0], qs['vid'][0])
+
webpage = self._download_webpage(url, video_id)
+ info = self._parse_html5_media_entries(url, webpage, video_id)[0]
- video_url = self._html_search_regex(
- r'>(?:Link|Download): <a[^>]+href="([^"]+)"', webpage, 'video URL')
- title = self._html_search_regex(
- r'<title>XboxClips \| ([^<]+)</title>', webpage, 'title')
- upload_date = unified_strdate(self._html_search_regex(
- r'>Recorded: ([^<]+)<', webpage, 'upload date', fatal=False))
+ title = self._html_search_meta(['og:title', 'twitter:title'], webpage)
+ upload_date = None
+ mobj = re.search(
+ r'>Recorded: (\d{2})-(Jan|Feb|Mar|Apr|May|Ju[nl]|Aug|Sep|Oct|Nov|Dec)-(\d{4})',
+ webpage)
+ if mobj:
+ upload_date = '%s%.2d%s' % (mobj.group(3), month_by_abbreviation(mobj.group(2)), mobj.group(1))
filesize = parse_filesize(self._html_search_regex(
r'>Size: ([^<]+)<', webpage, 'file size', fatal=False))
duration = int_or_none(self._html_search_regex(
view_count = int_or_none(self._html_search_regex(
r'>Views: (\d+)<', webpage, 'view count', fatal=False))
- return {
+ info.update({
'id': video_id,
- 'url': video_url,
'title': title,
'upload_date': upload_date,
'filesize_approx': filesize,
'duration': duration,
'view_count': view_count,
- }
+ })
+ return info
# coding: utf-8
from __future__ import unicode_literals
-import re
import hashlib
+import itertools
+import re
from .common import InfoExtractor
from ..compat import compat_str
missing_track_ids = [
track_id for track_id in track_ids
if track_id not in present_track_ids]
- missing_tracks = self._call_api(
- 'track-entries', tld, url, item_id,
- 'Downloading missing tracks JSON', {
- 'entries': ','.join(missing_track_ids),
- 'lang': tld,
- 'external-domain': 'music.yandex.%s' % tld,
- 'overembed': 'false',
- 'strict': 'true',
- })
- if missing_tracks:
- tracks.extend(missing_tracks)
+ # Request missing tracks in chunks to avoid exceeding max HTTP header size,
+ # see https://github.com/ytdl-org/youtube-dl/issues/27355
+ _TRACKS_PER_CHUNK = 250
+ for chunk_num in itertools.count(0):
+ start = chunk_num * _TRACKS_PER_CHUNK
+ end = start + _TRACKS_PER_CHUNK
+ missing_track_ids_req = missing_track_ids[start:end]
+ assert missing_track_ids_req
+ missing_tracks = self._call_api(
+ 'track-entries', tld, url, item_id,
+ 'Downloading missing tracks JSON chunk %d' % (chunk_num + 1), {
+ 'entries': ','.join(missing_track_ids_req),
+ 'lang': tld,
+ 'external-domain': 'music.yandex.%s' % tld,
+ 'overembed': 'false',
+ 'strict': 'true',
+ })
+ if missing_tracks:
+ tracks.extend(missing_tracks)
+ if end >= len(missing_track_ids):
+ break
return tracks
r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
'view count', default=None))
uploader = try_get(
- renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
+ renderer,
+ (lambda x: x['ownerText']['runs'][0]['text'],
+ lambda x: x['shortBylineText']['runs'][0]['text']), compat_str)
return {
'_type': 'url_transparent',
'ie_key': YoutubeIE.ie_key(),
class YoutubeIE(YoutubeBaseInfoExtractor):
IE_DESC = 'YouTube.com'
+ _INVIDIOUS_SITES = (
+ # invidious-redirect websites
+ r'(?:www\.)?redirect\.invidious\.io',
+ r'(?:(?:www|dev)\.)?invidio\.us',
+ # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md
+ r'(?:www\.)?invidious\.pussthecat\.org',
+ r'(?:www\.)?invidious\.048596\.xyz',
+ r'(?:www\.)?invidious\.zee\.li',
+ r'(?:www\.)?vid\.puffyan\.us',
+ r'(?:(?:www|au)\.)?ytprivate\.com',
+ r'(?:www\.)?invidious\.namazso\.eu',
+ r'(?:www\.)?invidious\.ethibox\.fr',
+ r'(?:www\.)?inv\.skyn3t\.in',
+ r'(?:www\.)?invidious\.himiko\.cloud',
+ r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion',
+ r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion',
+ r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion',
+ r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion',
+ # youtube-dl invidious instances list
+ r'(?:(?:www|no)\.)?invidiou\.sh',
+ r'(?:(?:www|fi)\.)?invidious\.snopyta\.org',
+ r'(?:www\.)?invidious\.kabi\.tk',
+ r'(?:www\.)?invidious\.13ad\.de',
+ r'(?:www\.)?invidious\.mastodon\.host',
+ r'(?:www\.)?invidious\.zapashcanon\.fr',
+ r'(?:www\.)?invidious\.kavin\.rocks',
+ r'(?:www\.)?invidious\.tube',
+ r'(?:www\.)?invidiou\.site',
+ r'(?:www\.)?invidious\.site',
+ r'(?:www\.)?invidious\.xyz',
+ r'(?:www\.)?invidious\.nixnet\.xyz',
+ r'(?:www\.)?invidious\.drycat\.fr',
+ r'(?:www\.)?tube\.poal\.co',
+ r'(?:www\.)?tube\.connect\.cafe',
+ r'(?:www\.)?vid\.wxzm\.sx',
+ r'(?:www\.)?vid\.mint\.lgbt',
+ r'(?:www\.)?yewtu\.be',
+ r'(?:www\.)?yt\.elukerio\.org',
+ r'(?:www\.)?yt\.lelux\.fi',
+ r'(?:www\.)?invidious\.ggc-project\.de',
+ r'(?:www\.)?yt\.maisputain\.ovh',
+ r'(?:www\.)?invidious\.toot\.koeln',
+ r'(?:www\.)?invidious\.fdn\.fr',
+ r'(?:www\.)?watch\.nettohikari\.com',
+ r'(?:www\.)?kgg2m7yk5aybusll\.onion',
+ r'(?:www\.)?qklhadlycap4cnod\.onion',
+ r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion',
+ r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion',
+ r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion',
+ r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion',
+ r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p',
+ r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion',
+ )
_VALID_URL = r"""(?x)^
(
(?:https?://|//) # http(s):// or protocol-independent URL
- (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
- (?:www\.)?deturl\.com/www\.youtube\.com/|
- (?:www\.)?pwnyoutube\.com/|
- (?:www\.)?hooktube\.com/|
- (?:www\.)?yourepeat\.com/|
- tube\.majestyc\.net/|
- # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances
- (?:www\.)?invidious\.pussthecat\.org/|
- (?:www\.)?invidious\.048596\.xyz/|
- (?:www\.)?invidious\.zee\.li/|
- (?:www\.)?vid\.puffyan\.us/|
- (?:(?:www|au)\.)?ytprivate\.com/|
- (?:www\.)?invidious\.namazso\.eu/|
- (?:www\.)?invidious\.ethibox\.fr/|
- (?:www\.)?inv\.skyn3t\.in/|
- (?:www\.)?invidious\.himiko\.cloud/|
- (?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion/|
- (?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion/|
- (?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion/|
- (?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion/|
- (?:(?:www|dev)\.)?invidio\.us/|
- (?:(?:www|no)\.)?invidiou\.sh/|
- (?:(?:www|fi)\.)?invidious\.snopyta\.org/|
- (?:www\.)?invidious\.kabi\.tk/|
- (?:www\.)?invidious\.13ad\.de/|
- (?:www\.)?invidious\.mastodon\.host/|
- (?:www\.)?invidious\.zapashcanon\.fr/|
- (?:www\.)?invidious\.kavin\.rocks/|
- (?:www\.)?invidious\.tube/|
- (?:www\.)?invidiou\.site/|
- (?:www\.)?invidious\.site/|
- (?:www\.)?invidious\.xyz/|
- (?:www\.)?invidious\.nixnet\.xyz/|
- (?:www\.)?invidious\.drycat\.fr/|
- (?:www\.)?tube\.poal\.co/|
- (?:www\.)?tube\.connect\.cafe/|
- (?:www\.)?vid\.wxzm\.sx/|
- (?:www\.)?vid\.mint\.lgbt/|
- (?:www\.)?yewtu\.be/|
- (?:www\.)?yt\.elukerio\.org/|
- (?:www\.)?yt\.lelux\.fi/|
- (?:www\.)?invidious\.ggc-project\.de/|
- (?:www\.)?yt\.maisputain\.ovh/|
- (?:www\.)?invidious\.toot\.koeln/|
- (?:www\.)?invidious\.fdn\.fr/|
- (?:www\.)?watch\.nettohikari\.com/|
- (?:www\.)?kgg2m7yk5aybusll\.onion/|
- (?:www\.)?qklhadlycap4cnod\.onion/|
- (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
- (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/|
- (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
- (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
- (?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
- (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
- youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
+ (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com|
+ (?:www\.)?deturl\.com/www\.youtube\.com|
+ (?:www\.)?pwnyoutube\.com|
+ (?:www\.)?hooktube\.com|
+ (?:www\.)?yourepeat\.com|
+ tube\.majestyc\.net|
+ %(invidious)s|
+ youtube\.googleapis\.com)/ # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
(?:(?:v|embed|e)/(?!videoseries)) # v/ or embed/ or e/
youtu\.be| # just youtu.be/xxxx
vid\.plus| # or vid.plus/xxxx
zwearz\.com/watch| # or zwearz.com/watch/xxxx
+ %(invidious)s
)/
|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
)
)
)
(?(1).+)? # if we found the ID, everything can follow
- $""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
+ $""" % {
+ 'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE,
+ 'invidious': '|'.join(_INVIDIOUS_SITES),
+ }
_PLAYER_INFO_RE = (
r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player',
r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$',
'url': 'https://invidio.us/watch?v=BaW_jenozKc',
'only_matching': True,
},
+ {
+ 'url': 'https://redirect.invidious.io/watch?v=BaW_jenozKc',
+ 'only_matching': True,
+ },
+ {
+ # from https://nitter.pussthecat.org/YouTube/status/1360363141947944964#m
+ 'url': 'https://redirect.invidious.io/Yh0AhrY9GjA',
+ 'only_matching': True,
+ },
{
# DRM protected
'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
'skip_download': True,
},
},
+ {
+ # controversial video, only works with bpctr when authenticated with cookies
+ 'url': 'https://www.youtube.com/watch?v=nGC3D_FkCmg',
+ 'only_matching': True,
+ },
]
def __init__(self, *args, **kwargs):
url, smuggled_data = unsmuggle_url(url, {})
video_id = self._match_id(url)
base_url = self.http_scheme() + '//www.youtube.com/'
- webpage_url = base_url + 'watch?v=' + video_id + '&has_verified=1'
+ webpage_url = base_url + 'watch?v=' + video_id + '&has_verified=1&bpctr=9999999999'
webpage = self._download_webpage(webpage_url, video_id, fatal=False)
player_response = None
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import float_or_none, int_or_none
+
+
+class ZhihuIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?zhihu\.com/zvideo/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'https://www.zhihu.com/zvideo/1342930761977176064',
+ 'md5': 'c8d4c9cd72dd58e6f9bc9c2c84266464',
+ 'info_dict': {
+ 'id': '1342930761977176064',
+ 'ext': 'mp4',
+ 'title': '写春联也太难了吧!',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'uploader': '桥半舫',
+ 'timestamp': 1612959715,
+ 'upload_date': '20210210',
+ 'uploader_id': '244ecb13b0fd7daf92235288c8ca3365',
+ 'duration': 146.333,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ zvideo = self._download_json(
+ 'https://www.zhihu.com/api/v4/zvideos/' + video_id, video_id)
+ title = zvideo['title']
+ video = zvideo.get('video') or {}
+
+ formats = []
+ for format_id, q in (video.get('playlist') or {}).items():
+ play_url = q.get('url') or q.get('play_url')
+ if not play_url:
+ continue
+ formats.append({
+ 'asr': int_or_none(q.get('sample_rate')),
+ 'filesize': int_or_none(q.get('size')),
+ 'format_id': format_id,
+ 'fps': int_or_none(q.get('fps')),
+ 'height': int_or_none(q.get('height')),
+ 'tbr': float_or_none(q.get('bitrate')),
+ 'url': play_url,
+ 'width': int_or_none(q.get('width')),
+ })
+ self._sort_formats(formats)
+
+ author = zvideo.get('author') or {}
+ url_token = author.get('url_token')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': video.get('thumbnail') or zvideo.get('image_url'),
+ 'uploader': author.get('name'),
+ 'timestamp': int_or_none(zvideo.get('published_at')),
+ 'uploader_id': author.get('id'),
+ 'uploader_url': 'https://www.zhihu.com/people/' + url_token if url_token else None,
+ 'duration': float_or_none(video.get('duration')),
+ 'view_count': int_or_none(zvideo.get('play_count')),
+ 'like_count': int_or_none(zvideo.get('liked_count')),
+ 'comment_count': int_or_none(zvideo.get('comment_count')),
+ }
except PostProcessingError as err:
self.report_warning('unable to embed using ffprobe & ffmpeg; %s' % error_to_compat_str(err))
- if not check_executable('AtomicParsley', ['-v']):
+ atomicparsley = next((
+ x for x in ['AtomicParsley', 'atomicparsley']
+ if check_executable(x, ['-v'])), None)
+ if atomicparsley is None:
raise EmbedThumbnailPPError('AtomicParsley was not found. Please install.')
- cmd = [encodeFilename('AtomicParsley', True),
+ cmd = [encodeFilename(atomicparsley, True),
encodeFilename(filename, True),
encodeArgument('--artwork'),
encodeFilename(thumbnail_filename, True),