2 from __future__
import unicode_literals
18 from ..compat
import (
19 compat_cookiejar_Cookie
,
22 compat_etree_fromstring
,
29 compat_urllib_parse_unquote
,
30 compat_urllib_parse_urlencode
,
31 compat_urllib_request
,
33 compat_xml_parse_error
,
35 from ..downloader
import FileDownloader
36 from ..downloader
.f4m
import (
38 remove_encrypted_media
,
66 parse_m3u8_attributes
,
88 class InfoExtractor(object):
89 """Information Extractor class.
91 Information extractors are the classes that, given a URL, extract
92 information about the video (or videos) the URL refers to. This
93 information includes the real video URL, the video title, author and
94 others. The information is stored in a dictionary which is then
95 passed to the YoutubeDL. The YoutubeDL processes this
96 information possibly downloading the video to the file system, among
97 other possible outcomes.
99 The type field determines the type of the result.
100 By far the most common value (and the default if _type is missing) is
101 "video", which indicates a single video.
103 For a video, the dictionaries must include the following fields:
105 id: Video identifier.
106 title: Video title, unescaped.
108 Additionally, it must contain either a formats entry or a url one:
110 formats: A list of dictionaries for each format available, ordered
111 from worst to best quality.
114 * url The mandatory URL representing the media:
115 for plain file media - HTTP URL of this file,
117 for HLS - URL of the M3U8 media playlist,
118 for HDS - URL of the F4M manifest,
120 - HTTP URL to plain file media (in case of
122 - URL of the MPD manifest or base URL
123 representing the media if MPD manifest
124 is parsed from a string (in case of
126 for MSS - URL of the ISM manifest.
128 The URL of the manifest file in case of
130 for HLS - URL of the M3U8 master playlist,
131 for HDS - URL of the F4M manifest,
132 for DASH - URL of the MPD manifest,
133 for MSS - URL of the ISM manifest.
134 * ext Will be calculated from URL if missing
135 * format A human-readable description of the format
136 ("mp4 container with h264/opus").
137 Calculated from the format_id, width, height.
138 and format_note fields if missing.
139 * format_id A short description of the format
140 ("mp4_h264_opus" or "19").
141 Technically optional, but strongly recommended.
142 * format_note Additional info about the format
143 ("3D" or "DASH video")
144 * width Width of the video, if known
145 * height Height of the video, if known
146 * resolution Textual description of width and height
147 * tbr Average bitrate of audio and video in KBit/s
148 * abr Average audio bitrate in KBit/s
149 * acodec Name of the audio codec in use
150 * asr Audio sampling rate in Hertz
151 * vbr Average video bitrate in KBit/s
153 * vcodec Name of the video codec in use
154 * container Name of the container format
155 * filesize The number of bytes, if known in advance
156 * filesize_approx An estimate for the number of bytes
157 * player_url SWF Player URL (used for rtmpdump).
158 * protocol The protocol that will be used for the actual
159 download, lower-case.
160 "http", "https", "rtsp", "rtmp", "rtmpe",
161 "m3u8", "m3u8_native" or "http_dash_segments".
163 Base URL for fragments. Each fragment's path
164 value (if present) will be relative to
166 * fragments A list of fragments of a fragmented media.
167 Each fragment entry must contain either an url
168 or a path. If an url is present it should be
169 considered by a client. Otherwise both path and
170 fragment_base_url must be present. Here is
171 the list of all potential fields:
172 * "url" - fragment's URL
173 * "path" - fragment's path relative to
175 * "duration" (optional, int or float)
176 * "filesize" (optional, int)
177 * preference Order number of this format. If this field is
178 present and not None, the formats get sorted
179 by this field, regardless of all other values.
180 -1 for default (order by other properties),
181 -2 or smaller for less than default.
182 < -1000 to hide the format (if there is
183 another one which is strictly better)
184 * language Language code, e.g. "de" or "en-US".
185 * language_preference Is this in the language mentioned in
187 10 if it's what the URL is about,
188 -1 for default (don't know),
189 -10 otherwise, other values reserved for now.
190 * quality Order number of the video quality of this
191 format, irrespective of the file format.
192 -1 for default (order by other properties),
193 -2 or smaller for less than default.
194 * source_preference Order number for this video source
195 (quality takes higher priority)
196 -1 for default (order by other properties),
197 -2 or smaller for less than default.
198 * http_headers A dictionary of additional HTTP headers
199 to add to the request.
200 * stretched_ratio If given and not 1, indicates that the
201 video's pixels are not square.
202 width : height ratio as float.
203 * no_resume The server does not support resuming the
204 (HTTP or RTMP) download. Boolean.
205 * downloader_options A dictionary of downloader options as
206 described in FileDownloader
208 url: Final video URL.
209 ext: Video filename extension.
210 format: The video format, defaults to ext (used for --get-format)
211 player_url: SWF Player URL (used for rtmpdump).
213 The following fields are optional:
215 alt_title: A secondary title of the video.
216 display_id An alternative identifier for the video, not necessarily
217 unique, but available before title. Typically, id is
218 something like "4234987", title "Dancing naked mole rats",
219 and display_id "dancing-naked-mole-rats"
220 thumbnails: A list of dictionaries, with the following entries:
221 * "id" (optional, string) - Thumbnail format ID
223 * "preference" (optional, int) - quality of the image
224 * "width" (optional, int)
225 * "height" (optional, int)
226 * "resolution" (optional, string "{width}x{height}",
228 * "filesize" (optional, int)
229 thumbnail: Full URL to a video thumbnail image.
230 description: Full video description.
231 uploader: Full name of the video uploader.
232 license: License name the video is licensed under.
233 creator: The creator of the video.
234 release_date: The date (YYYYMMDD) when the video was released.
235 timestamp: UNIX timestamp of the moment the video became available.
236 upload_date: Video upload date (YYYYMMDD).
237 If not explicitly set, calculated from timestamp.
238 uploader_id: Nickname or id of the video uploader.
239 uploader_url: Full URL to a personal webpage of the video uploader.
240 channel: Full name of the channel the video is uploaded on.
241 Note that channel fields may or may not repeat uploader
242 fields. This depends on a particular extractor.
243 channel_id: Id of the channel.
244 channel_url: Full URL to a channel webpage.
245 location: Physical location where the video was filmed.
246 subtitles: The available subtitles as a dictionary in the format
247 {tag: subformats}. "tag" is usually a language code, and
248 "subformats" is a list sorted from lower to higher
249 preference, each element is a dictionary with the "ext"
251 * "data": The subtitles file contents
252 * "url": A URL pointing to the subtitles file
253 "ext" will be calculated from URL if missing
254 automatic_captions: Like 'subtitles', used by the YoutubeIE for
255 automatically generated captions
256 duration: Length of the video in seconds, as an integer or float.
257 view_count: How many users have watched the video on the platform.
258 like_count: Number of positive ratings of the video
259 dislike_count: Number of negative ratings of the video
260 repost_count: Number of reposts of the video
261 average_rating: Average rating given by users, the scale used depends on the webpage
262 comment_count: Number of comments on the video
263 comments: A list of comments, each with one or more of the following
264 properties (all but one of text or html optional):
265 * "author" - human-readable name of the comment author
266 * "author_id" - user ID of the comment author
268 * "html" - Comment as HTML
269 * "text" - Plain text of the comment
270 * "timestamp" - UNIX timestamp of comment
271 * "parent" - ID of the comment this one is replying to.
272 Set to "root" to indicate that this is a
273 comment to the original video.
274 age_limit: Age restriction for the video, as an integer (years)
275 webpage_url: The URL to the video webpage, if given to yt-dlp it
276 should allow to get the same result again. (It will be set
277 by YoutubeDL if it's missing)
278 categories: A list of categories that the video falls in, for example
280 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
281 is_live: True, False, or None (=unknown). Whether this video is a
282 live stream that goes on instead of a fixed-length video.
283 was_live: True, False, or None (=unknown). Whether this video was
284 originally a live stream.
285 start_time: Time in seconds where the reproduction should start, as
286 specified in the URL.
287 end_time: Time in seconds where the reproduction should end, as
288 specified in the URL.
289 chapters: A list of dictionaries, with the following entries:
290 * "start_time" - The start time of the chapter in seconds
291 * "end_time" - The end time of the chapter in seconds
292 * "title" (optional, string)
293 playable_in_embed: Whether this video is allowed to play in embedded
294 players on other sites. Can be True (=always allowed),
295 False (=never allowed), None (=unknown), or a string
296 specifying the criteria for embedability (Eg: 'whitelist').
297 __post_extractor: A function to be called just before the metadata is
298 written to either disk, logger or console. The function
299 must return a dict which will be added to the info_dict.
300 This is useful for additional information that is
301 time-consuming to extract. Note that the fields thus
302 extracted will not be available to output template and
303 match_filter. So, only "comments" and "comment_count" are
304 currently allowed to be extracted via this method.
306 The following fields should only be used when the video belongs to some logical
309 chapter: Name or title of the chapter the video belongs to.
310 chapter_number: Number of the chapter the video belongs to, as an integer.
311 chapter_id: Id of the chapter the video belongs to, as a unicode string.
313 The following fields should only be used when the video is an episode of some
314 series, programme or podcast:
316 series: Title of the series or programme the video episode belongs to.
317 season: Title of the season the video episode belongs to.
318 season_number: Number of the season the video episode belongs to, as an integer.
319 season_id: Id of the season the video episode belongs to, as a unicode string.
320 episode: Title of the video episode. Unlike mandatory video title field,
321 this field should denote the exact title of the video episode
322 without any kind of decoration.
323 episode_number: Number of the video episode within a season, as an integer.
324 episode_id: Id of the video episode, as a unicode string.
326 The following fields should only be used when the media is a track or a part of
329 track: Title of the track.
330 track_number: Number of the track within an album or a disc, as an integer.
331 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
333 artist: Artist(s) of the track.
334 genre: Genre(s) of the track.
335 album: Title of the album the track belongs to.
336 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
337 album_artist: List of all artists appeared on the album (e.g.
338 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
340 disc_number: Number of the disc or other physical medium the track belongs to,
342 release_year: Year (YYYY) when the album was released.
344 Unless mentioned otherwise, the fields should be Unicode strings.
346 Unless mentioned otherwise, None is equivalent to absence of information.
349 _type "playlist" indicates multiple videos.
350 There must be a key "entries", which is a list, an iterable, or a PagedList
351 object, each element of which is a valid dictionary by this specification.
353 Additionally, playlists can have "id", "title", and any other relevant
354 attributes with the same semantics as videos (see above).
357 _type "multi_video" indicates that there are multiple videos that
358 form a single show, for example multiple acts of an opera or TV episode.
359 It must have an entries key like a playlist and contain all the keys
360 required for a video at the same time.
363 _type "url" indicates that the video must be extracted from another
364 location, possibly by a different extractor. Its only required key is:
365 "url" - the next URL to extract.
366 The key "ie_key" can be set to the class name (minus the trailing "IE",
367 e.g. "Youtube") if the extractor class is known in advance.
368 Additionally, the dictionary may have any properties of the resolved entity
369 known in advance, for example "title" if the title of the referred video is
373 _type "url_transparent" entities have the same specification as "url", but
374 indicate that the given additional information is more precise than the one
375 associated with the resolved URL.
376 This is useful when a site employs a video service that hosts the video and
377 its technical metadata, but that video service does not embed a useful
378 title, description etc.
381 Subclasses of this one should re-define the _real_initialize() and
382 _real_extract() methods and define a _VALID_URL regexp.
383 Probably, they should also be added to the list of extractors.
385 _GEO_BYPASS attribute may be set to False in order to disable
386 geo restriction bypass mechanisms for a particular extractor.
387 Though it won't disable explicit geo restriction bypass based on
388 country code provided with geo_bypass_country.
390 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
391 countries for this extractor. One of these countries will be used by
392 geo restriction bypass mechanism right away in order to bypass
393 geo restriction, of course, if the mechanism is not disabled.
395 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
396 IP blocks in CIDR notation for this extractor. One of these IP blocks
397 will be used by geo restriction bypass mechanism similarly
400 Finally, the _WORKING attribute should be set to False for broken IEs
401 in order to warn the users and skip the tests.
    # Fake X-Forwarded-For IP used by the geo restriction bypass mechanism;
    # populated lazily (see _initialize_geo_bypass).
    _x_forwarded_for_ip = None
    # List of presumably geo-unrestricted country codes (see class docstring).
    _GEO_COUNTRIES = None
    # List of presumably geo-unrestricted IP blocks in CIDR notation.
    _GEO_IP_BLOCKS = None
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # Reset the per-instance fake IP; the class-level default is shared.
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)
419 def suitable(cls
, url
):
420 """Receives a URL and returns True if suitable for this IE."""
422 # This does not use has/getattr intentionally - we want to know whether
423 # we have cached the regexp for *this* class, whereas getattr would also
424 # match the superclass
425 if '_VALID_URL_RE' not in cls
.__dict
__:
426 cls
._VALID
_URL
_RE
= re
.compile(cls
._VALID
_URL
)
427 return cls
._VALID
_URL
_RE
.match(url
) is not None
430 def _match_id(cls
, url
):
431 if '_VALID_URL_RE' not in cls
.__dict
__:
432 cls
._VALID
_URL
_RE
= re
.compile(cls
._VALID
_URL
)
433 m
= cls
._VALID
_URL
_RE
.match(url
)
435 return compat_str(m
.group('id'))
439 """Getter method for _WORKING."""
442 def initialize(self
):
443 """Initializes an instance (authentication, etc)."""
444 self
._initialize
_geo
_bypass
({
445 'countries': self
._GEO
_COUNTRIES
,
446 'ip_blocks': self
._GEO
_IP
_BLOCKS
,
449 self
._real
_initialize
()
452 def _initialize_geo_bypass(self
, geo_bypass_context
):
454 Initialize geo restriction bypass mechanism.
456 This method is used to initialize geo bypass mechanism based on faking
457 X-Forwarded-For HTTP header. A random country from provided country list
458 is selected and a random IP belonging to this country is generated. This
459 IP will be passed as X-Forwarded-For HTTP header in all subsequent
462 This method will be used for initial geo bypass mechanism initialization
463 during the instance initialization with _GEO_COUNTRIES and
466 You may also manually call it from extractor's code if geo bypass
467 information is not available beforehand (e.g. obtained during
468 extraction) or due to some other reason. In this case you should pass
469 this information in geo bypass context passed as first argument. It may
470 contain following fields:
472 countries: List of geo unrestricted countries (similar
474 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
475 (similar to _GEO_IP_BLOCKS)
478 if not self
._x
_forwarded
_for
_ip
:
480 # Geo bypass mechanism is explicitly disabled by user
481 if not self
._downloader
.params
.get('geo_bypass', True):
484 if not geo_bypass_context
:
485 geo_bypass_context
= {}
487 # Backward compatibility: previously _initialize_geo_bypass
488 # expected a list of countries, some 3rd party code may still use
490 if isinstance(geo_bypass_context
, (list, tuple)):
491 geo_bypass_context
= {
492 'countries': geo_bypass_context
,
495 # The whole point of geo bypass mechanism is to fake IP
496 # as X-Forwarded-For HTTP header based on some IP block or
499 # Path 1: bypassing based on IP block in CIDR notation
501 # Explicit IP block specified by user, use it right away
502 # regardless of whether extractor is geo bypassable or not
503 ip_block
= self
._downloader
.params
.get('geo_bypass_ip_block', None)
505 # Otherwise use random IP block from geo bypass context but only
506 # if extractor is known as geo bypassable
508 ip_blocks
= geo_bypass_context
.get('ip_blocks')
509 if self
._GEO
_BYPASS
and ip_blocks
:
510 ip_block
= random
.choice(ip_blocks
)
513 self
._x
_forwarded
_for
_ip
= GeoUtils
.random_ipv4(ip_block
)
514 if self
._downloader
.params
.get('verbose', False):
515 self
._downloader
.to_screen(
516 '[debug] Using fake IP %s as X-Forwarded-For.'
517 % self
._x
_forwarded
_for
_ip
)
520 # Path 2: bypassing based on country code
522 # Explicit country code specified by user, use it right away
523 # regardless of whether extractor is geo bypassable or not
524 country
= self
._downloader
.params
.get('geo_bypass_country', None)
526 # Otherwise use random country code from geo bypass context but
527 # only if extractor is known as geo bypassable
529 countries
= geo_bypass_context
.get('countries')
530 if self
._GEO
_BYPASS
and countries
:
531 country
= random
.choice(countries
)
534 self
._x
_forwarded
_for
_ip
= GeoUtils
.random_ipv4(country
)
535 if self
._downloader
.params
.get('verbose', False):
536 self
._downloader
.to_screen(
537 '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
538 % (self
._x
_forwarded
_for
_ip
, country
.upper()))
540 def extract(self
, url
):
541 """Extracts URL information and returns it in list of dicts."""
546 ie_result
= self
._real
_extract
(url
)
547 if self
._x
_forwarded
_for
_ip
:
548 ie_result
['__x_forwarded_for_ip'] = self
._x
_forwarded
_for
_ip
550 except GeoRestrictedError
as e
:
551 if self
.__maybe
_fake
_ip
_and
_retry
(e
.countries
):
554 except ExtractorError
:
556 except compat_http_client
.IncompleteRead
as e
:
557 raise ExtractorError('A network error has occurred.', cause
=e
, expected
=True)
558 except (KeyError, StopIteration) as e
:
559 raise ExtractorError('An extractor error has occurred.', cause
=e
)
561 def __maybe_fake_ip_and_retry(self
, countries
):
562 if (not self
._downloader
.params
.get('geo_bypass_country', None)
564 and self
._downloader
.params
.get('geo_bypass', True)
565 and not self
._x
_forwarded
_for
_ip
567 country_code
= random
.choice(countries
)
568 self
._x
_forwarded
_for
_ip
= GeoUtils
.random_ipv4(country_code
)
569 if self
._x
_forwarded
_for
_ip
:
571 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
572 % (self
._x
_forwarded
_for
_ip
, country_code
.upper()))
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # The downloader (a YoutubeDL instance) provides params, output and
        # network access (urlopen) for this extractor.
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Intentionally a no-op in the base class.
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Intentionally unimplemented in the base class.
590 """A string for getting the InfoExtractor with get_info_extractor"""
591 return compat_str(cls
.__name
__[:-2])
595 return compat_str(type(self
).__name
__[:-2])
598 def __can_accept_status_code(err
, expected_status
):
599 assert isinstance(err
, compat_urllib_error
.HTTPError
)
600 if expected_status
is None:
602 if isinstance(expected_status
, compat_integer_types
):
603 return err
.code
== expected_status
604 elif isinstance(expected_status
, (list, tuple)):
605 return err
.code
in expected_status
606 elif callable(expected_status
):
607 return expected_status(err
.code
) is True
611 def _request_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True, data
=None, headers
={}, query={}
, expected_status
=None):
613 Return the response handle.
615 See _download_webpage docstring for arguments specification.
617 if not self
._downloader
._first
_webpage
_request
:
618 sleep_interval
= float_or_none(self
._downloader
.params
.get('sleep_interval_requests')) or 0
619 if sleep_interval
> 0:
620 self
.to_screen('Sleeping %s seconds ...' % sleep_interval
)
621 time
.sleep(sleep_interval
)
623 self
._downloader
._first
_webpage
_request
= False
626 self
.report_download_webpage(video_id
)
627 elif note
is not False:
629 self
.to_screen('%s' % (note
,))
631 self
.to_screen('%s: %s' % (video_id
, note
))
633 # Some sites check X-Forwarded-For HTTP header in order to figure out
634 # the origin of the client behind proxy. This allows bypassing geo
635 # restriction by faking this header's value to IP that belongs to some
636 # geo unrestricted country. We will do so once we encounter any
637 # geo restriction error.
638 if self
._x
_forwarded
_for
_ip
:
639 if 'X-Forwarded-For' not in headers
:
640 headers
['X-Forwarded-For'] = self
._x
_forwarded
_for
_ip
642 if isinstance(url_or_request
, compat_urllib_request
.Request
):
643 url_or_request
= update_Request(
644 url_or_request
, data
=data
, headers
=headers
, query
=query
)
647 url_or_request
= update_url_query(url_or_request
, query
)
648 if data
is not None or headers
:
649 url_or_request
= sanitized_Request(url_or_request
, data
, headers
)
650 exceptions
= [compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
]
651 if hasattr(ssl
, 'CertificateError'):
652 exceptions
.append(ssl
.CertificateError
)
654 return self
._downloader
.urlopen(url_or_request
)
655 except tuple(exceptions
) as err
:
656 if isinstance(err
, compat_urllib_error
.HTTPError
):
657 if self
.__can
_accept
_status
_code
(err
, expected_status
):
658 # Retain reference to error to prevent file object from
659 # being closed before it can be read. Works around the
660 # effects of <https://bugs.python.org/issue15002>
661 # introduced in Python 3.4.1.
668 errnote
= 'Unable to download webpage'
670 errmsg
= '%s: %s' % (errnote
, error_to_compat_str(err
))
672 raise ExtractorError(errmsg
, sys
.exc_info()[2], cause
=err
)
674 self
._downloader
.report_warning(errmsg
)
677 def _download_webpage_handle(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True, encoding
=None, data
=None, headers
={}, query={}
, expected_status
=None):
679 Return a tuple (page content as string, URL handle).
681 See _download_webpage docstring for arguments specification.
683 # Strip hashes from the URL (#1038)
684 if isinstance(url_or_request
, (compat_str
, str)):
685 url_or_request
= url_or_request
.partition('#')[0]
687 urlh
= self
._request
_webpage
(url_or_request
, video_id
, note
, errnote
, fatal
, data
=data
, headers
=headers
, query
=query
, expected_status
=expected_status
)
691 content
= self
._webpage
_read
_content
(urlh
, url_or_request
, video_id
, note
, errnote
, fatal
, encoding
=encoding
)
692 return (content
, urlh
)
695 def _guess_encoding_from_content(content_type
, webpage_bytes
):
696 m
= re
.match(r
'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type
)
698 encoding
= m
.group(1)
700 m
= re
.search(br
'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
701 webpage_bytes[:1024])
703 encoding = m.group(1).decode('ascii')
704 elif webpage_bytes.startswith(b'\xff\xfe'):
711 def __check_blocked(self, content):
712 first_block = content[:512]
713 if ('<title>Access to this site is blocked</title>' in content
714 and 'Websense' in first_block):
715 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
716 blocked_iframe = self._html_search_regex(
717 r'<iframe src="([^
"]+)"', content,
718 'Websense information URL
', default=None)
720 msg += ' Visit
%s for more details
' % blocked_iframe
721 raise ExtractorError(msg, expected=True)
722 if '<title
>The URL you requested has been blocked
</title
>' in first_block:
724 'Access to this webpage has been blocked by Indian censorship
. '
725 'Use a VPN
or proxy
server (with --proxy
) to route around it
.')
726 block_msg = self._html_search_regex(
727 r'</h1
><p
>(.*?
)</p
>',
728 content, 'block message
', default=None)
730 msg += ' (Message
: "%s")' % block_msg.replace('\n', ' ')
731 raise ExtractorError(msg, expected=True)
732 if ('<title
>TTK
:: Доступ к ресурсу ограничен
</title
>' in content
733 and 'blocklist
.rkn
.gov
.ru
' in content):
734 raise ExtractorError(
735 'Access to this webpage has been blocked by decision of the Russian government
. '
736 'Visit http
://blocklist
.rkn
.gov
.ru
/ for a block reason
.',
739 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
740 content_type = urlh.headers.get('Content
-Type
', '')
741 webpage_bytes = urlh.read()
742 if prefix is not None:
743 webpage_bytes = prefix + webpage_bytes
745 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
746 if self._downloader.params.get('dump_intermediate_pages
', False):
747 self.to_screen('Dumping request to
' + urlh.geturl())
748 dump = base64.b64encode(webpage_bytes).decode('ascii
')
749 self._downloader.to_screen(dump)
750 if self._downloader.params.get('write_pages
', False):
751 basen = '%s_%s' % (video_id, urlh.geturl())
753 h = '___
' + hashlib.md5(basen.encode('utf
-8')).hexdigest()
754 basen = basen[:240 - len(h)] + h
755 raw_filename = basen + '.dump
'
756 filename = sanitize_filename(raw_filename, restricted=True)
757 self.to_screen('Saving request to
' + filename)
758 # Working around MAX_PATH limitation on Windows (see
759 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
760 if compat_os_name == 'nt
':
761 absfilepath = os.path.abspath(filename)
762 if len(absfilepath) > 259:
763 filename = '\\\\?
\\' + absfilepath
764 with open(filename, 'wb
') as outf:
765 outf.write(webpage_bytes)
768 content = webpage_bytes.decode(encoding, 'replace
')
770 content = webpage_bytes.decode('utf
-8', 'replace
')
772 self.__check_blocked(content)
776 def _download_webpage(
777 self, url_or_request, video_id, note=None, errnote=None,
778 fatal=True, tries=1, timeout=5, encoding=None, data=None,
779 headers={}, query={}, expected_status=None):
781 Return the data of the page as a string.
784 url_or_request -- plain text URL as a string or
785 a compat_urllib_request.Requestobject
786 video_id -- Video/playlist/item identifier (string)
789 note -- note printed before downloading (string)
790 errnote -- note printed in case of an error (string)
791 fatal -- flag denoting whether error should be considered fatal,
792 i.e. whether it should cause ExtractionError to be raised,
793 otherwise a warning will be reported and extraction continued
794 tries -- number of tries
795 timeout -- sleep interval between tries
796 encoding -- encoding for a page content decoding, guessed automatically
797 when not explicitly specified
798 data -- POST data (bytes)
799 headers -- HTTP headers (dict)
800 query -- URL query (dict)
801 expected_status -- allows to accept failed HTTP requests (non 2xx
802 status code) by explicitly specifying a set of accepted status
803 codes. Can be any of the following entities:
804 - an integer type specifying an exact failed status code to
806 - a list or a tuple of integer types specifying a list of
807 failed status codes to accept
808 - a callable accepting an actual failed status code and
809 returning True if it should be accepted
810 Note that this argument does not affect success status codes (2xx)
811 which are always accepted.
816 while success is False:
818 res = self._download_webpage_handle(
819 url_or_request, video_id, note, errnote, fatal,
820 encoding=encoding, data=data, headers=headers, query=query,
821 expected_status=expected_status)
823 except compat_http_client.IncompleteRead as e:
825 if try_count >= tries:
827 self._sleep(timeout, video_id)
834 def _download_xml_handle(
835 self, url_or_request, video_id, note='Downloading XML
',
836 errnote='Unable to download XML
', transform_source=None,
837 fatal=True, encoding=None, data=None, headers={}, query={},
838 expected_status=None):
840 Return a tuple (xml as an compat_etree_Element, URL handle).
842 See _download_webpage docstring for arguments specification.
844 res = self._download_webpage_handle(
845 url_or_request, video_id, note, errnote, fatal=fatal,
846 encoding=encoding, data=data, headers=headers, query=query,
847 expected_status=expected_status)
850 xml_string, urlh = res
851 return self._parse_xml(
852 xml_string, video_id, transform_source=transform_source,
856 self, url_or_request, video_id,
857 note='Downloading XML
', errnote='Unable to download XML
',
858 transform_source=None, fatal=True, encoding=None,
859 data=None, headers={}, query={}, expected_status=None):
861 Return the xml as an compat_etree_Element.
863 See _download_webpage docstring for arguments specification.
865 res = self._download_xml_handle(
866 url_or_request, video_id, note=note, errnote=errnote,
867 transform_source=transform_source, fatal=fatal, encoding=encoding,
868 data=data, headers=headers, query=query,
869 expected_status=expected_status)
870 return res if res is False else res[0]
872 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
874 xml_string = transform_source(xml_string)
876 return compat_etree_fromstring(xml_string.encode('utf
-8'))
877 except compat_xml_parse_error as ve:
878 errmsg = '%s: Failed to parse XML
' % video_id
880 raise ExtractorError(errmsg, cause=ve)
882 self.report_warning(errmsg + str(ve))
884 def _download_json_handle(
885 self, url_or_request, video_id, note='Downloading JSON metadata
',
886 errnote='Unable to download JSON metadata
', transform_source=None,
887 fatal=True, encoding=None, data=None, headers={}, query={},
888 expected_status=None):
890 Return a tuple (JSON object, URL handle).
892 See _download_webpage docstring for arguments specification.
894 res = self._download_webpage_handle(
895 url_or_request, video_id, note, errnote, fatal=fatal,
896 encoding=encoding, data=data, headers=headers, query=query,
897 expected_status=expected_status)
900 json_string, urlh = res
901 return self._parse_json(
902 json_string, video_id, transform_source=transform_source,
906 self, url_or_request, video_id, note='Downloading JSON metadata
',
907 errnote='Unable to download JSON metadata
', transform_source=None,
908 fatal=True, encoding=None, data=None, headers={}, query={},
909 expected_status=None):
911 Return the JSON object as a dict.
913 See _download_webpage docstring for arguments specification.
915 res = self._download_json_handle(
916 url_or_request, video_id, note=note, errnote=errnote,
917 transform_source=transform_source, fatal=fatal, encoding=encoding,
918 data=data, headers=headers, query=query,
919 expected_status=expected_status)
920 return res if res is False else res[0]
922 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
924 json_string = transform_source(json_string)
926 return json.loads(json_string)
927 except ValueError as ve:
928 errmsg = '%s: Failed to parse JSON
' % video_id
930 raise ExtractorError(errmsg, cause=ve)
932 self.report_warning(errmsg + str(ve))
934 def report_warning(self, msg, video_id=None):
935 idstr = '' if video_id is None else '%s: ' % video_id
936 self._downloader.report_warning(
937 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)
    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)
    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')
    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')
960 def raise_login_required(msg='This video
is only available
for registered users
'):
961 raise ExtractorError(
962 '%s. Use
--username
and --password
or --netrc to provide account credentials
.' % msg,
966 def raise_geo_restricted(msg='This video
is not available
from your location due to geo restriction
', countries=None):
967 raise GeoRestrictedError(msg, countries=countries)
969 # Methods for following #608
971 def url_result(url, ie=None, video_id=None, video_title=None):
972 """Returns a URL that points to a page that should be processed"""
973 # TODO: ie should be the class used for getting the info
974 video_info = {'_type
': 'url
',
977 if video_id is not None:
978 video_info['id'] = video_id
979 if video_title is not None:
980 video_info['title
'] = video_title
983 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
985 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
987 return self.playlist_result(
988 urls, playlist_id=playlist_id, playlist_title=playlist_title)
991 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, **kwargs):
992 """Returns a playlist"""
993 video_info = {'_type
': 'playlist
',
995 video_info.update(kwargs)
997 video_info['id'] = playlist_id
999 video_info['title
'] = playlist_title
1000 if playlist_description is not None:
1001 video_info['description
'] = playlist_description
1004 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1006 Perform a regex search on the given string, using a single or a list of
1007 patterns returning the first matching group.
1008 In case of failure return a default value or raise a WARNING or a
1009 RegexNotFoundError, depending on fatal, specifying the field name.
1011 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
1012 mobj = re.search(pattern, string, flags)
1015 mobj = re.search(p, string, flags)
1019 if not self._downloader.params.get('no_color
') and compat_os_name != 'nt
' and sys.stderr.isatty():
1020 _name = '\033[0;34m
%s\033[0m
' % name
1026 # return the first matching group
1027 return next(g for g in mobj.groups() if g is not None)
1029 return mobj.group(group)
1030 elif default is not NO_DEFAULT:
1033 raise RegexNotFoundError('Unable to extract
%s' % _name)
1035 self._downloader.report_warning('unable to extract
%s' % _name + bug_reports_message())
1038 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
1040 Like _search_regex, but strips HTML tags and unescapes entities.
1042 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
1044 return clean_html(res).strip()
1048 def _get_netrc_login_info(self, netrc_machine=None):
1051 netrc_machine = netrc_machine or self._NETRC_MACHINE
1053 if self._downloader.params.get('usenetrc
', False):
1055 info = netrc.netrc().authenticators(netrc_machine)
1056 if info is not None:
1060 raise netrc.NetrcParseError(
1061 'No authenticators
for %s' % netrc_machine)
1062 except (IOError, netrc.NetrcParseError) as err:
1063 self._downloader.report_warning(
1064 'parsing
.netrc
: %s' % error_to_compat_str(err))
1066 return username, password
1068 def _get_login_info(self, username_option='username
', password_option='password
', netrc_machine=None):
1070 Get the login info as (username, password)
1071 First look for the manually specified credentials using username_option
1072 and password_option as keys in params dictionary. If no such credentials
1073 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
1075 If there's no info available
, return (None, None)
1077 if self._downloader is None:
1080 downloader_params = self._downloader.params
1082 # Attempt to use provided username and password or .netrc data
1083 if downloader_params.get(username_option) is not None:
1084 username = downloader_params[username_option]
1085 password = downloader_params[password_option]
1087 username, password = self._get_netrc_login_info(netrc_machine)
1089 return username, password
1091 def _get_tfa_info(self, note='two-factor verification code'):
1093 Get the two
-factor authentication info
1094 TODO
- asking the user will be required
for sms
/phone verify
1095 currently just uses the command line option
1096 If there
's no info available, return None
1098 if self._downloader is None:
1100 downloader_params = self._downloader.params
1102 if downloader_params.get('twofactor
') is not None:
1103 return downloader_params['twofactor
']
1105 return compat_getpass('Type
%s and press
[Return
]: ' % note)
1107 # Helper functions for extracting OpenGraph info
1109 def _og_regexes(prop):
1110 content_re = r'content
=(?
:"([^"]+?
)"|\'([^\']+?)\'|\s*([^\s"\'=<>`
]+?
))'
1111 property_re = (r'(?
:name|
property)=(?
:\'og
[:-]%(prop)s\'|
"og[:-]%(prop)s"|\s
*og
[:-]%(prop)s\b)'
1112 % {'prop': re.escape(prop)})
1113 template = r'<meta
[^
>]+?
%s[^
>]+?
%s'
1115 template % (property_re, content_re),
1116 template % (content_re, property_re),
1120 def _meta_regex(prop):
1121 return r'''(?isx)<meta
1122 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
1123 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
1125 def _og_search_property(self, prop, html, name=None, **kargs):
1126 if not isinstance(prop, (list, tuple)):
1129 name = 'OpenGraph
%s' % prop[0]
1132 og_regexes.extend(self._og_regexes(p))
1133 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
1136 return unescapeHTML(escaped)
1138 def _og_search_thumbnail(self, html, **kargs):
1139 return self._og_search_property('image
', html, 'thumbnail URL
', fatal=False, **kargs)
1141 def _og_search_description(self, html, **kargs):
1142 return self._og_search_property('description
', html, fatal=False, **kargs)
1144 def _og_search_title(self, html, **kargs):
1145 return self._og_search_property('title
', html, **kargs)
1147 def _og_search_video_url(self, html, name='video url
', secure=True, **kargs):
1148 regexes = self._og_regexes('video
') + self._og_regexes('video
:url
')
1150 regexes = self._og_regexes('video
:secure_url
') + regexes
1151 return self._html_search_regex(regexes, html, name, **kargs)
1153 def _og_search_url(self, html, **kargs):
1154 return self._og_search_property('url
', html, **kargs)
1156 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1157 if not isinstance(name, (list, tuple)):
1159 if display_name is None:
1160 display_name = name[0]
1161 return self._html_search_regex(
1162 [self._meta_regex(n) for n in name],
1163 html, display_name, fatal=fatal, group='content
', **kwargs)
1165 def _dc_search_uploader(self, html):
1166 return self._html_search_meta('dc
.creator
', html, 'uploader
')
1168 def _rta_search(self, html):
1169 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1170 if re.search(r'(?ix
)<meta\s
+name
="rating"\s
+'
1171 r' content
="RTA-5042-1996-1400-1577-RTA"',
1176 def _media_rating_search(self, html):
1177 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1178 rating = self._html_search_meta('rating
', html)
1190 return RATING_TABLE.get(rating.lower())
1192 def _family_friendly_search(self, html):
1193 # See http://schema.org/VideoObject
1194 family_friendly = self._html_search_meta(
1195 'isFamilyFriendly
', html, default=None)
1197 if not family_friendly:
1206 return RATING_TABLE.get(family_friendly.lower())
1208 def _twitter_search_player(self, html):
1209 return self._html_search_meta('twitter
:player
', html,
1210 'twitter card player
')
1212 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1213 json_ld_list = list(re.finditer(JSON_LD_RE, html))
1214 default = kwargs.get('default
', NO_DEFAULT)
1215 # JSON-LD may be malformed and thus `fatal` should be respected.
1216 # At the same time `default` may be passed that assumes `fatal=False`
1217 # for _search_regex. Let's simulate the same behavior here
as well
.
1218 fatal
= kwargs
.get('fatal', True) if default
== NO_DEFAULT
else False
1220 for mobj
in json_ld_list
:
1221 json_ld_item
= self
._parse
_json
(
1222 mobj
.group('json_ld'), video_id
, fatal
=fatal
)
1223 if not json_ld_item
:
1225 if isinstance(json_ld_item
, dict):
1226 json_ld
.append(json_ld_item
)
1227 elif isinstance(json_ld_item
, (list, tuple)):
1228 json_ld
.extend(json_ld_item
)
1230 json_ld
= self
._json
_ld
(json_ld
, video_id
, fatal
=fatal
, expected_type
=expected_type
)
1233 if default
is not NO_DEFAULT
:
1236 raise RegexNotFoundError('Unable to extract JSON-LD')
1238 self
._downloader
.report_warning('unable to extract JSON-LD %s' % bug_reports_message())
1241 def _json_ld(self
, json_ld
, video_id
, fatal
=True, expected_type
=None):
1242 if isinstance(json_ld
, compat_str
):
1243 json_ld
= self
._parse
_json
(json_ld
, video_id
, fatal
=fatal
)
1247 if not isinstance(json_ld
, (list, tuple, dict)):
1249 if isinstance(json_ld
, dict):
1252 INTERACTION_TYPE_MAP
= {
1253 'CommentAction': 'comment',
1254 'AgreeAction': 'like',
1255 'DisagreeAction': 'dislike',
1256 'LikeAction': 'like',
1257 'DislikeAction': 'dislike',
1258 'ListenAction': 'view',
1259 'WatchAction': 'view',
1260 'ViewAction': 'view',
1263 def extract_interaction_type(e
):
1264 interaction_type
= e
.get('interactionType')
1265 if isinstance(interaction_type
, dict):
1266 interaction_type
= interaction_type
.get('@type')
1267 return str_or_none(interaction_type
)
1269 def extract_interaction_statistic(e
):
1270 interaction_statistic
= e
.get('interactionStatistic')
1271 if isinstance(interaction_statistic
, dict):
1272 interaction_statistic
= [interaction_statistic
]
1273 if not isinstance(interaction_statistic
, list):
1275 for is_e
in interaction_statistic
:
1276 if not isinstance(is_e
, dict):
1278 if is_e
.get('@type') != 'InteractionCounter':
1280 interaction_type
= extract_interaction_type(is_e
)
1281 if not interaction_type
:
1283 # For interaction count some sites provide string instead of
1284 # an integer (as per spec) with non digit characters (e.g. ",")
1285 # so extracting count with more relaxed str_to_int
1286 interaction_count
= str_to_int(is_e
.get('userInteractionCount'))
1287 if interaction_count
is None:
1289 count_kind
= INTERACTION_TYPE_MAP
.get(interaction_type
.split('/')[-1])
1292 count_key
= '%s_count' % count_kind
1293 if info
.get(count_key
) is not None:
1295 info
[count_key
] = interaction_count
1297 def extract_video_object(e
):
1298 assert e
['@type'] == 'VideoObject'
1300 'url': url_or_none(e
.get('contentUrl')),
1301 'title': unescapeHTML(e
.get('name')),
1302 'description': unescapeHTML(e
.get('description')),
1303 'thumbnail': url_or_none(e
.get('thumbnailUrl') or e
.get('thumbnailURL')),
1304 'duration': parse_duration(e
.get('duration')),
1305 'timestamp': unified_timestamp(e
.get('uploadDate')),
1306 'uploader': str_or_none(e
.get('author')),
1307 'filesize': float_or_none(e
.get('contentSize')),
1308 'tbr': int_or_none(e
.get('bitrate')),
1309 'width': int_or_none(e
.get('width')),
1310 'height': int_or_none(e
.get('height')),
1311 'view_count': int_or_none(e
.get('interactionCount')),
1313 extract_interaction_statistic(e
)
1317 item_type
= e
.get('@type')
1318 if expected_type
is not None and expected_type
!= item_type
:
1320 if item_type
in ('TVEpisode', 'Episode'):
1321 episode_name
= unescapeHTML(e
.get('name'))
1323 'episode': episode_name
,
1324 'episode_number': int_or_none(e
.get('episodeNumber')),
1325 'description': unescapeHTML(e
.get('description')),
1327 if not info
.get('title') and episode_name
:
1328 info
['title'] = episode_name
1329 part_of_season
= e
.get('partOfSeason')
1330 if isinstance(part_of_season
, dict) and part_of_season
.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1332 'season': unescapeHTML(part_of_season
.get('name')),
1333 'season_number': int_or_none(part_of_season
.get('seasonNumber')),
1335 part_of_series
= e
.get('partOfSeries') or e
.get('partOfTVSeries')
1336 if isinstance(part_of_series
, dict) and part_of_series
.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1337 info
['series'] = unescapeHTML(part_of_series
.get('name'))
1338 elif item_type
== 'Movie':
1340 'title': unescapeHTML(e
.get('name')),
1341 'description': unescapeHTML(e
.get('description')),
1342 'duration': parse_duration(e
.get('duration')),
1343 'timestamp': unified_timestamp(e
.get('dateCreated')),
1345 elif item_type
in ('Article', 'NewsArticle'):
1347 'timestamp': parse_iso8601(e
.get('datePublished')),
1348 'title': unescapeHTML(e
.get('headline')),
1349 'description': unescapeHTML(e
.get('articleBody')),
1351 elif item_type
== 'VideoObject':
1352 extract_video_object(e
)
1353 if expected_type
is None:
1357 video
= e
.get('video')
1358 if isinstance(video
, dict) and video
.get('@type') == 'VideoObject':
1359 extract_video_object(video
)
1360 if expected_type
is None:
1364 return dict((k
, v
) for k
, v
in info
.items() if v
is not None)
1367 def _hidden_inputs(html
):
1368 html
= re
.sub(r
'<!--(?:(?!<!--).)*-->', '', html
)
1370 for input in re
.findall(r
'(?i)(<input[^>]+>)', html
):
1371 attrs
= extract_attributes(input)
1374 if attrs
.get('type') not in ('hidden', 'submit'):
1376 name
= attrs
.get('name') or attrs
.get('id')
1377 value
= attrs
.get('value')
1378 if name
and value
is not None:
1379 hidden_inputs
[name
] = value
1380 return hidden_inputs
1382 def _form_hidden_inputs(self
, form_id
, html
):
1383 form
= self
._search
_regex
(
1384 r
'(?is)<form[^>]+?id=(["\'])%s\
1[^
>]*>(?P
<form
>.+?
)</form
>' % form_id,
1385 html, '%s form
' % form_id, group='form
')
1386 return self._hidden_inputs(form)
1389 regex = r' *((?P
<reverse
>\
+)?
(?P
<field
>[a
-zA
-Z0
-9_]+)((?P
<seperator
>[~
:])(?P
<limit
>.*?
))?
)?
*$
'
1391 default = ('hidden
', 'hasvid
', 'ie_pref
', 'lang
', 'quality
',
1392 'res
', 'fps
', 'codec
:vp9
.2
', 'size
', 'br
', 'asr
',
1393 'proto
', 'ext
', 'has_audio
', 'source
', 'format_id
') # These must not be aliases
1396 'vcodec
': {'type': 'ordered
', 'regex
': True,
1397 'order
': ['av0?
1', 'vp0?
9.2', 'vp0?
9', '[hx
]265|he?vc?
', '[hx
]264|avc
', 'vp0?
8', 'mp4v|h263
', 'theora
', '', None, 'none
']},
1398 'acodec
': {'type': 'ordered
', 'regex
': True,
1399 'order
': ['opus
', 'vorbis
', 'aac
', 'mp?
4a?
', 'mp3
', 'e?a?c
-?
3', 'dts
', '', None, 'none
']},
1400 'proto
': {'type': 'ordered
', 'regex
': True, 'field
': 'protocol
',
1401 'order
': ['(ht|f
)tps
', '(ht|f
)tp$
', 'm3u8
.+', 'm3u8
', '.*dash
', '', 'mms|rtsp
', 'none
', 'f4
']},
1402 'vext
': {'type': 'ordered
', 'field
': 'video_ext
',
1403 'order
': ('mp4
', 'webm
', 'flv
', '', 'none
'),
1404 'order_free
': ('webm
', 'mp4
', 'flv
', '', 'none
')},
1405 'aext
': {'type': 'ordered
', 'field
': 'audio_ext
',
1406 'order
': ('m4a
', 'aac
', 'mp3
', 'ogg
', 'opus
', 'webm
', '', 'none
'),
1407 'order_free
': ('opus
', 'ogg
', 'webm
', 'm4a
', 'mp3
', 'aac
', '', 'none
')},
1408 'hidden
': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000},
1409 'ie_pref
': {'priority': True, 'type': 'extractor'},
1410 'hasvid
': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
1411 'hasaud
': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
1412 'lang
': {'priority': True, 'convert': 'ignore', 'type': 'extractor', 'field': 'language_preference'},
1413 'quality
': {'convert': 'float_none', 'type': 'extractor'},
1414 'filesize
': {'convert': 'bytes'},
1415 'fs_approx
': {'convert': 'bytes', 'field': 'filesize_approx'},
1416 'id': {'convert': 'string', 'field': 'format_id'},
1417 'height
': {'convert': 'float_none'},
1418 'width
': {'convert': 'float_none'},
1419 'fps
': {'convert': 'float_none'},
1420 'tbr
': {'convert': 'float_none'},
1421 'vbr
': {'convert': 'float_none'},
1422 'abr
': {'convert': 'float_none'},
1423 'asr
': {'convert': 'float_none'},
1424 'source
': {'convert': 'ignore', 'type': 'extractor', 'field': 'source_preference'},
1426 'codec
': {'type': 'combined', 'field': ('vcodec', 'acodec')},
1427 'br
': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
1428 'size
': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')},
1429 'ext
': {'type': 'combined', 'field': ('vext', 'aext')},
1430 'res
': {'type': 'multiple', 'field': ('height', 'width'), 'function': min},
1432 # Most of these exist only for compatibility reasons
1433 'dimension
': {'type': 'alias', 'field': 'res'},
1434 'resolution
': {'type': 'alias', 'field': 'res'},
1435 'extension
': {'type': 'alias', 'field': 'ext'},
1436 'bitrate
': {'type': 'alias', 'field': 'br'},
1437 'total_bitrate
': {'type': 'alias', 'field': 'tbr'},
1438 'video_bitrate
': {'type': 'alias', 'field': 'vbr'},
1439 'audio_bitrate
': {'type': 'alias', 'field': 'abr'},
1440 'framerate
': {'type': 'alias', 'field': 'fps'},
1441 'language_preference
': {'type': 'alias', 'field': 'lang'}, # not named as 'language
' because such a field exists
1442 'protocol
': {'type': 'alias', 'field': 'proto'},
1443 'source_preference
': {'type': 'alias', 'field': 'source'},
1444 'filesize_approx
': {'type': 'alias', 'field': 'fs_approx'},
1445 'filesize_estimate
': {'type': 'alias', 'field': 'size'},
1446 'samplerate
': {'type': 'alias', 'field': 'asr'},
1447 'video_ext
': {'type': 'alias', 'field': 'vext'},
1448 'audio_ext
': {'type': 'alias', 'field': 'aext'},
1449 'video_codec
': {'type': 'alias', 'field': 'vcodec'},
1450 'audio_codec
': {'type': 'alias', 'field': 'acodec'},
1451 'video
': {'type': 'alias', 'field': 'hasvid'},
1452 'has_video
': {'type': 'alias', 'field': 'hasvid'},
1453 'audio
': {'type': 'alias', 'field': 'hasaud'},
1454 'has_audio
': {'type': 'alias', 'field': 'hasaud'},
1455 'extractor
': {'type': 'alias', 'field': 'ie_pref'},
1456 'preference
': {'type': 'alias', 'field': 'ie_pref'},
1457 'extractor_preference
': {'type': 'alias', 'field': 'ie_pref'},
1458 'format_id
': {'type': 'alias', 'field': 'id'},
1463 def _get_field_setting(self, field, key):
1464 if field not in self.settings:
1465 self.settings[field] = {}
1466 propObj = self.settings[field]
1467 if key not in propObj:
1468 type = propObj.get('type')
1470 default = 'preference
' if type == 'extractor
' else (field,) if type in ('combined
', 'multiple
') else field
1471 elif key == 'convert
':
1472 default = 'order
' if type == 'ordered
' else 'float_string
' if field else 'ignore
'
1474 default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,), 'function': max}.get(key, None)
1475 propObj[key] = default
1478 def _resolve_field_value(self, field, value, convertNone=False):
1483 value = value.lower()
1484 conversion = self._get_field_setting(field, 'convert
')
1485 if conversion == 'ignore
':
1487 if conversion == 'string
':
1489 elif conversion == 'float_none
':
1490 return float_or_none(value)
1491 elif conversion == 'bytes':
1492 return FileDownloader.parse_bytes(value)
1493 elif conversion == 'order
':
1494 order_list = (self._use_free_order and self._get_field_setting(field, 'order_free
')) or self._get_field_setting(field, 'order
')
1495 use_regex = self._get_field_setting(field, 'regex
')
1496 list_length = len(order_list)
1497 empty_pos = order_list.index('') if '' in order_list else list_length + 1
1498 if use_regex and value is not None:
1499 for i, regex in enumerate(order_list):
1500 if regex and re.match(regex, value):
1501 return list_length - i
1502 return list_length - empty_pos # not in list
1503 else: # not regex or value = None
1504 return list_length - (order_list.index(value) if value in order_list else empty_pos)
1506 if value.isnumeric():
1509 self.settings[field]['convert
'] = 'string
'
1512 def evaluate_params(self, params, sort_extractor):
1513 self._use_free_order = params.get('prefer_free_formats
', False)
1514 self._sort_user = params.get('format_sort
', [])
1515 self._sort_extractor = sort_extractor
1517 def add_item(field, reverse, closest, limit_text):
1518 field = field.lower()
1519 if field in self._order:
1521 self._order.append(field)
1522 limit = self._resolve_field_value(field, limit_text)
1525 'closest
': False if limit is None else closest,
1526 'limit_text
': limit_text,
1528 if field in self.settings:
1529 self.settings[field].update(data)
1531 self.settings[field] = data
1534 tuple(field for field in self.default if self._get_field_setting(field, 'forced
'))
1535 + (tuple() if params.get('format_sort_force
', False)
1536 else tuple(field for field in self.default if self._get_field_setting(field, 'priority
')))
1537 + tuple(self._sort_user) + tuple(sort_extractor) + self.default)
1539 for item in sort_list:
1540 match = re.match(self.regex, item)
1542 raise ExtractorError('Invalid format sort string
"%s" given by extractor
' % item)
1543 field = match.group('field
')
1546 if self._get_field_setting(field, 'type') == 'alias
':
1547 field = self._get_field_setting(field, 'field
')
1548 reverse = match.group('reverse
') is not None
1549 closest = match.group('seperator
') == '~
'
1550 limit_text = match.group('limit
')
1552 has_limit = limit_text is not None
1553 has_multiple_fields = self._get_field_setting(field, 'type') == 'combined
'
1554 has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit
')
1556 fields = self._get_field_setting(field, 'field
') if has_multiple_fields else (field,)
1557 limits = limit_text.split(":") if has_multiple_limits else (limit_text,) if has_limit else tuple()
1558 limit_count = len(limits)
1559 for (i, f) in enumerate(fields):
1560 add_item(f, reverse, closest,
1561 limits[i] if i < limit_count
1562 else limits[0] if has_limit and not has_multiple_limits
1565 def print_verbose_info(self, to_screen):
1566 to_screen('[debug
] Sort order given by user
: %s' % ','.join(self._sort_user))
1567 if self._sort_extractor:
1568 to_screen('[debug
] Sort order given by extractor
: %s' % ', '.join(self._sort_extractor))
1569 to_screen('[debug
] Formats
sorted by
: %s' % ', '.join(['%s%s%s' % (
1570 '+' if self._get_field_setting(field, 'reverse
') else '', field,
1571 '%s%s(%s)' % ('~
' if self._get_field_setting(field, 'closest
') else ':',
1572 self._get_field_setting(field, 'limit_text
'),
1573 self._get_field_setting(field, 'limit
'))
1574 if self._get_field_setting(field, 'limit_text
') is not None else '')
1575 for field in self._order if self._get_field_setting(field, 'visible
')]))
1577 def _calculate_field_preference_from_value(self, format, field, type, value):
1578 reverse = self._get_field_setting(field, 'reverse
')
1579 closest = self._get_field_setting(field, 'closest
')
1580 limit = self._get_field_setting(field, 'limit
')
1582 if type == 'extractor
':
1583 maximum = self._get_field_setting(field, 'max')
1584 if value is None or (maximum is not None and value >= maximum):
1586 elif type == 'boolean
':
1587 in_list = self._get_field_setting(field, 'in_list
')
1588 not_in_list = self._get_field_setting(field, 'not_in_list
')
1589 value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1
1590 elif type == 'ordered
':
1591 value = self._resolve_field_value(field, value, True)
1593 # try to convert to number
1594 val_num = float_or_none(value)
1595 is_num = self._get_field_setting(field, 'convert
') != 'string
' and val_num is not None
1599 return ((-10, 0) if value is None
1600 else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher
1601 else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest
1602 else (0, value, 0) if not reverse and (limit is None or value <= limit)
1603 else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit
1604 else (-1, value, 0))
1606 def _calculate_field_preference(self, format, field):
1607 type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple
1608 get_value = lambda f: format.get(self._get_field_setting(f, 'field
'))
1609 if type == 'multiple
':
1610 type = 'field
' # Only 'field
' is allowed in multiple for now
1611 actual_fields = self._get_field_setting(field, 'field
')
1613 def wrapped_function(values):
1614 values = tuple(filter(lambda x: x is not None, values))
1615 return (self._get_field_setting(field, 'function
')(*values) if len(values) > 1
1616 else values[0] if values
1619 value = wrapped_function((get_value(f) for f in actual_fields))
1621 value = get_value(field)
1622 return self._calculate_field_preference_from_value(format, field, type, value)
1624 def calculate_preference(self, format):
1625 # Determine missing protocol
1626 if not format.get('protocol
'):
1627 format['protocol
'] = determine_protocol(format)
1629 # Determine missing ext
1630 if not format.get('ext
') and 'url
' in format:
1631 format['ext
'] = determine_ext(format['url
'])
1632 if format.get('vcodec
') == 'none
':
1633 format['audio_ext
'] = format['ext
']
1634 format['video_ext
'] = 'none
'
1636 format['video_ext
'] = format['ext
']
1637 format['audio_ext
'] = 'none
'
1638 # if format.get('preference
') is None and format.get('ext
') in ('f4f
', 'f4m
'): # Not supported?
1639 # format['preference
'] = -1000
1641 # Determine missing bitrates
1642 if format.get('tbr
') is None:
1643 if format.get('vbr
') is not None and format.get('abr
') is not None:
1644 format['tbr
'] = format.get('vbr
', 0) + format.get('abr
', 0)
1646 if format.get('vcodec
') != "none" and format.get('vbr
') is None:
1647 format['vbr
'] = format.get('tbr
') - format.get('abr
', 0)
1648 if format.get('acodec
') != "none" and format.get('abr
') is None:
1649 format['abr
'] = format.get('tbr
') - format.get('vbr
', 0)
1651 return tuple(self._calculate_field_preference(format, field) for field in self._order)
1653 def _sort_formats(self, formats, field_preference=[]):
1655 raise ExtractorError('No video formats found
')
1656 format_sort = self.FormatSort() # params and to_screen are taken from the downloader
1657 format_sort.evaluate_params(self._downloader.params, field_preference)
1658 if self._downloader.params.get('verbose
', False):
1659 format_sort.print_verbose_info(self._downloader.to_screen)
1660 formats.sort(key=lambda f: format_sort.calculate_preference(f))
1662 def _check_formats(self, formats, video_id):
1664 formats[:] = filter(
1665 lambda f: self._is_valid_url(
1667 item='%s video format
' % f.get('format_id
') if f.get('format_id
') else 'video
'),
1671 def _remove_duplicate_formats(formats):
1675 if f['url
'] not in format_urls:
1676 format_urls.add(f['url
'])
1677 unique_formats.append(f)
1678 formats[:] = unique_formats
1680 def _is_valid_url(self, url, video_id, item='video
', headers={}):
1681 url = self._proto_relative_url(url, scheme='http
:')
1682 # For now assume non HTTP(S) URLs always valid
1683 if not (url.startswith('http
://') or url.startswith('https
://')):
1686 self._request_webpage(url, video_id, 'Checking
%s URL
' % item, headers=headers)
1688 except ExtractorError as e:
1690 '%s: %s URL
is invalid
, skipping
: %s'
1691 % (video_id, item, error_to_compat_str(e.cause)))
1694 def http_scheme(self):
1695 """ Either "http:" or "https:", depending on the user's preferences
"""
1698 if self._downloader.params.get('prefer_insecure', False)
1701 def _proto_relative_url(self, url, scheme=None):
1704 if url.startswith('//'):
1706 scheme = self.http_scheme()
1711 def _sleep(self, timeout, video_id, msg_template=None):
1712 if msg_template is None:
1713 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1714 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1718 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1719 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1720 fatal=True, m3u8_id=None, data=None, headers={}, query={}):
1721 manifest = self._download_xml(
1722 manifest_url, video_id, 'Downloading f4m manifest',
1723 'Unable to download f4m manifest',
1724 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1725 # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
1726 transform_source=transform_source,
1727 fatal=fatal, data=data, headers=headers, query=query)
1729 if manifest is False:
1732 return self._parse_f4m_formats(
1733 manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1734 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1736 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
1737 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1738 fatal=True, m3u8_id=None):
1739 if not isinstance(manifest, compat_etree_Element) and not fatal:
1742 # currently yt-dlp cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1743 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1744 if akamai_pv is not None and ';' in akamai_pv.text:
1745 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1746 if playerVerificationChallenge.strip() != '':
1750 manifest_version = '1.0'
1751 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1753 manifest_version = '2.0'
1754 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1755 # Remove unsupported DRM protected media from final formats
1756 # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
1757 media_nodes = remove_encrypted_media(media_nodes)
1761 manifest_base_url = get_base_url(manifest)
1763 bootstrap_info = xpath_element(
1764 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1765 'bootstrap info', default=None)
1768 mime_type = xpath_text(
1769 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1770 'base URL', default=None)
1771 if mime_type and mime_type.startswith('audio/'):
1774 for i, media_el in enumerate(media_nodes):
1775 tbr = int_or_none(media_el.attrib.get('bitrate'))
1776 width = int_or_none(media_el.attrib.get('width'))
1777 height = int_or_none(media_el.attrib.get('height'))
1778 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1779 # If <bootstrapInfo> is present, the specified f4m is a
1780 # stream-level manifest, and only set-level manifests may refer to
1781 # external resources. See section 11.4 and section 4 of F4M spec
1782 if bootstrap_info is None:
1784 # @href is introduced in 2.0, see section 11.6 of F4M spec
1785 if manifest_version == '2.0':
1786 media_url = media_el.attrib.get('href')
1787 if media_url is None:
1788 media_url = media_el.attrib.get('url')
1792 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1793 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1794 # If media_url is itself a f4m manifest do the recursive extraction
1795 # since bitrates in parent manifest (this one) and media_url manifest
1796 # may differ leading to inability to resolve the format by requested
1797 # bitrate in f4m downloader
1798 ext = determine_ext(manifest_url)
1800 f4m_formats = self._extract_f4m_formats(
1801 manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
1802 transform_source=transform_source, fatal=fatal)
1803 # Sometimes stream-level manifest contains single media entry that
1804 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1805 # At the same time parent's media entry in set-level manifest may
1806 # contain it. We will copy it from parent in such cases.
1807 if len(f4m_formats) == 1:
1810 'tbr': f.get('tbr') or tbr,
1811 'width': f.get('width') or width,
1812 'height': f.get('height') or height,
1813 'format_id': f.get('format_id') if not tbr else format_id,
1816 formats.extend(f4m_formats)
1819 formats.extend(self._extract_m3u8_formats(
1820 manifest_url, video_id, 'mp4', preference=preference,
1821 quality=quality, m3u8_id=m3u8_id, fatal=fatal))
1824 'format_id': format_id,
1825 'url': manifest_url,
1826 'manifest_url': manifest_url,
1827 'ext': 'flv' if bootstrap_info is not None else None,
1833 'preference': preference,
    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
        # Build a single pseudo-format entry that points at the master m3u8
        # playlist itself, so quality selection can be delegated to the
        # downloader instead of being resolved here.
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            # Rank the meta format 100 below the given preference.
            # NOTE(review): preference == 0 is falsy and falls through to
            # -100 — presumably intentional, but confirm.
            'preference': preference - 100 if preference else -100,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None, quality=None,
                              m3u8_id=None, live=False, note=None, errnote=None,
                              fatal=True, data=None, headers={}, query={}):
        # Download an HLS (m3u8) playlist and parse it into format dicts.
        # Thin wrapper: the actual parsing happens in _parse_m3u8_formats.
        res = self._download_webpage_handle(
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal, data=data, headers=headers, query=query)
        m3u8_doc, urlh = res
        # Use the final (post-redirect) URL as the base for relative URIs.
        m3u8_url = urlh.geturl()
        return self._parse_m3u8_formats(
            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
            preference=preference, quality=quality, m3u8_id=m3u8_id,
            note=note, errnote=errnote, fatal=fatal, live=live, data=data,
            headers=headers, query=query, video_id=video_id)
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None, quality=None,
                            m3u8_id=None, live=False, note=None, errnote=None,
                            fatal=True, data=None, headers={}, query={}, video_id=None):
        """
        Parse an HLS playlist document into a list of format dicts.

        A master playlist is expanded into one format per variant stream and
        per audio/video rendition; a media playlist is returned as a single
        format pointing at m3u8_url.
        """
        # Reject DRM-protected playlists early.
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
        # Resolve possibly-relative playlist URIs against the manifest URL.
        format_url = lambda u: (
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))
        split_discontinuity = self._downloader.params.get('hls_split_discontinuity', False)
        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        def _extract_m3u8_playlist_formats(format_url=None, m3u8_doc=None):
            # Download the playlist if only its URL was given, then split it
            # into per-#EXT-X-DISCONTINUITY sections when requested; returns
            # a list of dicts describing each section ('index', 'files', ...).
            res = self._download_webpage_handle(
                format_url, video_id,
                errnote=errnote or 'Failed to download m3u8 playlist information',
                fatal=fatal, data=data, headers=headers, query=query)
            m3u8_doc, urlh = res
            format_url = urlh.geturl()
            playlist_formats = []
                if split_discontinuity
            for line in m3u8_doc.splitlines():
                if not line.startswith('#'):
                    format_info['files'].append(line)
                elif split_discontinuity and line.startswith('#EXT-X-DISCONTINUITY'):
                    playlist_formats.append(format_info)
            playlist_formats.append(format_info)
            return playlist_formats

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            playlist_formats = _extract_m3u8_playlist_formats(m3u8_doc=m3u8_doc)
            for format in playlist_formats:
                    format_id.append(m3u8_id)
                format_index = format.get('index')
                    format_id.append(str(format_index))
                    'format_id': '-'.join(format_id),
                    'format_index': format_index,
                    'protocol': entry_protocol,
                    'preference': preference,

        last_stream_inf = {}

        def extract_media(x_media_line):
            # Handle one #EXT-X-MEDIA line: register the rendition in its
            # group and, if it carries its own URI, emit format(s) for it.
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
            media_url = media.get('URI')
                manifest_url = format_url(media_url)
                playlist_formats = _extract_m3u8_playlist_formats(manifest_url)
                for format in playlist_formats:
                    format_index = format.get('index')
                    for v in (m3u8_id, group_id, name):
                        format_id.append(str(format_index))
                        'format_id': '-'.join(format_id),
                        'format_index': format_index,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'language': media.get('LANGUAGE'),
                        'protocol': entry_protocol,
                        'preference': preference,
                    if media_type == 'AUDIO':
                        f['vcodec'] = 'none'

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the
        # chance to detect video only formats when EXT-X-STREAM-INF tags
        # precede EXT-X-MEDIA tags in HLS manifest such as [3].
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-MEDIA:'):

        # Second pass: one format per #EXT-X-STREAM-INF variant; the URI is
        # the first non-comment line following the tag.
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#') or not line.strip():
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH')
                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                manifest_url = format_url(line.strip())
                playlist_formats = _extract_m3u8_playlist_formats(manifest_url)
                for format in playlist_formats:
                        format_id.append(m3u8_id)
                    format_index = format.get('index')
                    stream_name = build_stream_name()
                    # Bandwidth of live streams may differ over time thus making
                    # format_id unpredictable. So it's better to keep provided
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                        format_id.append(str(format_index))
                        'format_id': '-'.join(format_id),
                        'format_index': format_index,
                        'url': manifest_url,
                        'manifest_url': m3u8_url,
                        'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                        'protocol': entry_protocol,
                        'preference': preference,
                    resolution = last_stream_inf.get('RESOLUTION')
                        mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                            f['width'] = int(mobj.group('width'))
                            f['height'] = int(mobj.group('height'))
                    # Unified Streaming Platform: recover audio/video bitrates
                    # encoded into the URL itself.
                        r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                        abr, vbr = mobj.groups()
                        abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    codecs = parse_codecs(last_stream_inf.get('CODECS'))
                    audio_group_id = last_stream_inf.get('AUDIO')
                    # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                    # references a rendition group MUST have a CODECS attribute.
                    # However, this is not always respected, for example, [2]
                    # contains EXT-X-STREAM-INF tag which references AUDIO
                    # rendition group but does not have CODECS and despite
                    # referencing an audio group it represents a complete
                    # (with audio and video) format. So, for such cases we will
                    # ignore references to rendition groups and treat them
                    # as complete formats.
                    if audio_group_id and codecs and f.get('vcodec') != 'none':
                        audio_group = groups.get(audio_group_id)
                        if audio_group and audio_group[0].get('URI'):
                            # TODO: update acodec for audio only formats with
                            f['acodec'] = 'none'
                    # Emit an additional plain-HTTP variant when the playlist
                    # advertises a progressive download URI.
                    progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
                        del http_f['manifest_url']
                        'format_id': f['format_id'].replace('hls-', 'http-'),
                        'url': progressive_uri,
                        formats.append(http_f)
                # Reset per-variant state for the next #EXT-X-STREAM-INF.
                last_stream_inf = {}
    def _xpath_ns(path, namespace=None):
        # Qualify each component of an XPath with the given XML namespace,
        # ElementTree-style ('{namespace}tag'); '.' components pass through.
        # No `self` parameter — presumably decorated @staticmethod (decorator
        # not visible here); confirm against the full file.
        for c in path.split('/'):
            if not c or c == '.':
            out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)
    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
        # Download a SMIL document and parse only its formats
        # (see _extract_smil_info for the full info-dict variant).
        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
        namespace = self._parse_smil_namespace(smil)
        return self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        # Download a SMIL document and parse it into a full info dict
        # (formats plus title/thumbnails/subtitles, via _parse_smil).
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
2156 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
2157 return self._download_xml(
2158 smil_url, video_id, 'Downloading SMIL file',
2159 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        # Turn a parsed SMIL document into a full info dict: formats,
        # subtitles, plus title/description/upload_date/thumbnails metadata
        # harvested from <head><meta> and <image> elements.
        namespace = self._parse_smil_namespace(smil)
        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
        # Fall back to the SMIL file name (sans extension) as the video id.
        video_id = os.path.splitext(url_basename(smil_url))[0]
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
            if not title and name == 'title':
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)
            # Thumbnail candidates: every <image> element with a src.
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'subtitles': subtitles,
2201 def _parse_smil_namespace(self, smil):
2202 return self._search_regex(
2203 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        # Extract format dicts from the <video>/<audio> elements of a parsed
        # SMIL document. Dispatches per source: RTMP, HLS (m3u8), HDS (f4m),
        # DASH (mpd), MSS (ism) and plain HTTP.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            # Base URL for relative media sources.
            b = meta.get('base') or meta.get('httpBase')
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            # Skip empty and already-seen sources.
            if not src or src in srcs:
            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base
            if proto == 'rtmp' or streamer.startswith('rtmp'):
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'filesize': filesize,
                # Optional caller hook to rewrite streamer/playpath pairs.
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()
            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # Single-entry HLS: inherit quality metadata from this medium.
                if len(m3u8_formats) == 1:
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                    'plugin': 'flowplayer-3.2.0.1',
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                # Plain progressive HTTP download.
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'filesize': filesize,
    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        # Collect subtitle tracks from <textstream> elements, keyed by
        # language (falling back to subtitles_lang when none is declared).
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            # Skip empty and duplicate sources.
            if not src or src in urls:
            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
            subtitles.setdefault(lang, []).append({
    def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
        # Download an XSPF playlist and parse it into entries.
        # NOTE(review): 'xpsf' in the note string below looks like a typo for
        # 'xspf' — user-visible message only; confirm before changing.
        xspf = self._download_xml(
            xspf_url, playlist_id, 'Downloading xpsf playlist',
            'Unable to download xspf manifest', fatal=fatal)
        return self._parse_xspf(
            xspf, playlist_id, xspf_url=xspf_url,
            xspf_base_url=base_url(xspf_url))
    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
        # Parse an XSPF document into playlist entries; understands the
        # StreamOne ('s1') extension attributes for per-location metadata.
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            # <duration> is in milliseconds; convert to seconds.
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
            # One format per <location>, resolved against the playlist base.
            for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
                format_url = urljoin(xspf_base_url, location.text)
                    'manifest_url': xspf_url,
                    'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                    'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                    'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            self._sort_formats(formats)
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
        # Download a DASH MPD manifest and parse it into format dicts.
        # Thin wrapper: the actual parsing happens in _parse_mpd_formats.
        res = self._download_xml_handle(
            note=note or 'Downloading MPD manifest',
            errnote=errnote or 'Failed to download MPD manifest',
            fatal=fatal, data=data, headers=headers, query=query)
        # Base URL is derived from the final (post-redirect) manifest URL.
        mpd_base_url = base_url(urlh.geturl())
        return self._parse_mpd_formats(
            mpd_doc, mpd_id, mpd_base_url, mpd_url)
    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
        """
        # Live (dynamic) manifests are skipped unless explicitly enabled.
        if not self._downloader.params.get('dynamic_mpd'):
            if mpd_doc.get('type') == 'dynamic':

        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            # Merge segment info from this element over its parent's
            # (Period -> AdaptationSet -> Representation inheritance).
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements. We will only extract relevant
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                        ms_info['total_number'] = 0
                            # @r is a repeat count; each S yields 1 + r segments.
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                start_number = source.get('startNumber')
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = float(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            segment_template = element.find(_add_ns('SegmentTemplate'))
            if segment_template is not None:
                extract_common(segment_template)
                media = segment_template.get('media')
                    ms_info['media'] = media
                initialization = segment_template.get('initialization')
                    ms_info['initialization'] = initialization
                    extract_Initialization(segment_template)

        skip_unplayable = not self._downloader.params.get('allow_unplayable_formats')

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                if skip_unplayable and is_drm_protected(adaptation_set):
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    if skip_unplayable and is_drm_protected(representation):
                    # Representation attributes override AdaptationSet ones.
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = mime_type.split('/')[0]
                    if content_type == 'text':
                        # TODO implement WebVTT downloading
                    elif content_type in ('video', 'audio'):
                        # Compose the effective base URL from nested <BaseURL>
                        # elements, innermost first, stopping at an absolute one.
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
                            if base_url_e is not None:
                                base_url = base_url_e.text + base_url
                                if re.match(r'^https?://', base_url):
                        if mpd_base_url and not re.match(r'^https?://', base_url):
                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                            base_url = mpd_base_url + base_url
                        representation_id = representation_attrib.get('id')
                        lang = representation_attrib.get('lang')
                        url_el = representation.find(_add_ns('BaseURL'))
                        # YouTube-specific attribute carrying the file size.
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                            'manifest_url': mpd_url,
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': float_or_none(bandwidth, 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                            'container': mimetype2ext(mime_type) + '_dash',
                        f.update(parse_codecs(representation_attrib.get('codecs')))
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                        def prepare_template(template_name, identifiers):
                            # Convert a DASH $...$ URL template into a Python
                            # %-format string usable with the % operator.
                            tmpl = representation_ms_info[template_name]
                            # First of, % characters outside $...$ templates
                            # must be escaped by doubling for proper processing
                            # by % operator string formatting used further (see
                            # https://github.com/ytdl-org/youtube-dl/issues/16867).
                                    in_template = not in_template
                                elif c == '%' and not in_template:
                            # Next, $...$ templates are translated to their
                            # %(...) counterparts to be used with % operator
                            t = t.replace('$RepresentationID$', representation_id)
                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                            # NOTE(review): str.replace returns a new string;
                            # this result is discarded, so '$$' is never
                            # unescaped — likely should be t = t.replace(...).
                            t.replace('$$', '$')

                        # @initialization is a regular template like @media one
                        # so it should be handled just the same way (see
                        # https://github.com/ytdl-org/youtube-dl/issues/11605)
                        if 'initialization' in representation_ms_info:
                            initialization_template = prepare_template(
                            # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                            # $Time$ shall not be included for @initialization thus
                            # only $Bandwidth$ remains
                            representation_ms_info['initialization_url'] = initialization_template % {
                                'Bandwidth': bandwidth,

                        def location_key(location):
                            return 'url' if re.match(r'^https?://', location) else 'path'

                        if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                            media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                            media_location_key = location_key(media_template)

                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                            # can't be used at the same time
                            if '%(Number' in media_template and 's' not in representation_ms_info:
                                segment_duration = None
                                if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                representation_ms_info['fragments'] = [{
                                    media_location_key: media_template % {
                                        'Number': segment_number,
                                        'Bandwidth': bandwidth,
                                    'duration': segment_duration,
                                } for segment_number in range(
                                    representation_ms_info['start_number'],
                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                                # $Number*$ or $Time$ in media template with S list available
                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                                representation_ms_info['fragments'] = []
                                segment_number = representation_ms_info['start_number']

                                def add_segment_url():
                                    segment_url = media_template % {
                                        'Time': segment_time,
                                        'Bandwidth': bandwidth,
                                        'Number': segment_number,
                                    representation_ms_info['fragments'].append({
                                        media_location_key: segment_url,
                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),

                                for num, s in enumerate(representation_ms_info['s']):
                                    segment_time = s.get('t') or segment_time
                                    # Expand @r repeats into individual segments.
                                    for r in range(s.get('r', 0)):
                                        segment_time += segment_d
                                        segment_time += segment_d
                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                            # or any YouTube dashsegments video
                            timescale = representation_ms_info['timescale']
                            for s in representation_ms_info['s']:
                                duration = float_or_none(s['d'], timescale)
                                for r in range(s.get('r', 0) + 1):
                                    segment_uri = representation_ms_info['segment_urls'][segment_index]
                                        location_key(segment_uri): segment_uri,
                                        'duration': duration,
                            representation_ms_info['fragments'] = fragments
                        elif 'segment_urls' in representation_ms_info:
                            # Segment URLs with no SegmentTimeline
                            # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                            # https://github.com/ytdl-org/youtube-dl/pull/14844
                            segment_duration = float_or_none(
                                representation_ms_info['segment_duration'],
                                representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                            for segment_url in representation_ms_info['segment_urls']:
                                    location_key(segment_url): segment_url,
                                if segment_duration:
                                    fragment['duration'] = segment_duration
                                fragments.append(fragment)
                            representation_ms_info['fragments'] = fragments
                        # If there is a fragments key available then we correctly recognized fragmented media.
                        # Otherwise we will assume unfragmented media with direct access. Technically, such
                        # assumption is not necessarily correct since we may simply have no support for
                        # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                        if 'fragments' in representation_ms_info:
                                # NB: mpd_url may be empty when MPD manifest is parsed from a string
                                'url': mpd_url or base_url,
                                'fragment_base_url': base_url,
                                'protocol': 'http_dash_segments',
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url']
                                if not f.get('url'):
                                    f['url'] = initialization_url
                                f['fragments'].append({location_key(initialization_url): initialization_url})
                            f['fragments'].extend(representation_ms_info['fragments'])
                            # Assuming direct URL to unfragmented media.
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
        # Download a Smooth Streaming (ISM) manifest and parse it into
        # format dicts. Thin wrapper around _parse_ism_formats.
        res = self._download_xml_handle(
            note=note or 'Downloading ISM manifest',
            errnote=errnote or 'Failed to download ISM manifest',
            fatal=fatal, data=data, headers=headers, query=query)
        # Use the final (post-redirect) URL as the base for track URLs.
        return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2683 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
        Parse formats from ISM manifest.
        References:
         1. [MS-SSTR]: Smooth Streaming Protocol,
            https://msdn.microsoft.com/en-us/library/ff469518.aspx
2690 if ism_doc.get('IsLive') == 'TRUE':
2692 if (not self._downloader.params.get('allow_unplayable_formats')
2693 and ism_doc.find('Protection') is not None):
2696 duration = int(ism_doc.attrib['Duration'])
2697 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2700 for stream in ism_doc.findall('StreamIndex'):
2701 stream_type = stream.get('Type')
2702 if stream_type not in ('video', 'audio'):
2704 url_pattern = stream.attrib['Url']
2705 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2706 stream_name = stream.get('Name')
2707 for track in stream.findall('QualityLevel'):
2708 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2709 # TODO: add support for WVC1 and WMAP
2710 if fourcc not in ('H264', 'AVC1', 'AACL'):
2711 self.report_warning('%s is not a supported codec' % fourcc)
2713 tbr = int(track.attrib['Bitrate']) // 1000
2714 # [1] does not mention Width and Height attributes. However,
2715 # they're often present while MaxWidth and MaxHeight are
2716 # missing, so should be used as fallbacks
2717 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2718 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2719 sampling_rate = int_or_none(track.get('SamplingRate'))
2721 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2722 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2728 stream_fragments = stream.findall('c')
2729 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2730 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2731 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2732 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2733 if not fragment_ctx['duration']:
2735 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2737 next_fragment_time = duration
2738 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2739 for _ in range(fragment_repeat):
2741 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2742 'duration': fragment_ctx['duration'] / stream_timescale,
2744 fragment_ctx['time'] += fragment_ctx['duration']
2748 format_id.append(ism_id)
2750 format_id.append(stream_name)
2751 format_id.append(compat_str(tbr))
2754 'format_id': '-'.join(format_id),
2756 'manifest_url': ism_url,
2757 'ext': 'ismv' if stream_type == 'video' else 'isma',
2761 'asr': sampling_rate,
2762 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2763 'acodec': 'none' if stream_type == 'video' else fourcc,
2765 'fragments': fragments,
2766 '_download_params': {
2767 'duration': duration,
2768 'timescale': stream_timescale,
2769 'width': width or 0,
2770 'height': height or 0,
2772 'codec_private_data': track.get('CodecPrivateData'),
2773 'sampling_rate': sampling_rate,
2774 'channels': int_or_none(track.get('Channels', 2)),
2775 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2776 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None):
        """Extract media entries from HTML5 <video>/<audio> (and amp-/dl8- variants) tags in webpage."""
        # NOTE(review): interior lines appear to be missing from this copy
        # (entries/media_info initialisation, several guards and dict-literal
        # heads) — verify against upstream before editing further.
        def absolute_url(item_url):
            # resolve item_url against the page URL
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # split a MIME type string into ext + codec fields
            if not content_type:
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            mimetype, codecs = ctr.groups()
            f = parse_codecs(codecs)
            f['ext'] = mimetype2ext(mimetype)

        def _media_formats(src, cur_media_type, type_info={}):
            # return (is_plain_url, formats) for a single src attribute
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
            is_plain_url = False
            formats = self._extract_m3u8_formats(
                full_url, video_id, ext='mp4',
                entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                preference=preference, quality=quality, fatal=False)
            is_plain_url = False
            formats = self._extract_mpd_formats(
                full_url, video_id, mpd_id=mpd_id, fatal=False)
            'vcodec': 'none' if cur_media_type == 'audio' else None,
            return is_plain_url, formats

        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we wll include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/
        _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
        media_tags = [(media_tag, media_tag_name, media_type, '')
                      for media_tag, media_tag_name, media_type
                      in re.findall(r'(?s)(<(%s)[^>]*/>)' % _MEDIA_TAG_NAME_RE, webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage))
        for media_tag, _, media_type, media_content in media_tags:
            media_attributes = extract_attributes(media_tag)
            src = strip_or_none(media_attributes.get('src'))
            _, formats = _media_formats(src, media_type)
            media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
            for source_tag in re.findall(r'<source[^>]+>', media_content):
                s_attr = extract_attributes(source_tag)
                # data-video-src and data-src are non standard but seen
                # several times in the wild
                src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                f = parse_content_type(s_attr.get('type'))
                is_plain_url, formats = _media_formats(src, media_type, f)
                # width, height, res, label and title attributes are
                # all not standard but seen several times in the wild
                for lbl in ('label', 'title')
                if str_or_none(s_attr.get(lbl))
                width = int_or_none(s_attr.get('width'))
                height = (int_or_none(s_attr.get('height'))
                          or int_or_none(s_attr.get('res')))
                if not width or not height:
                    # fall back to parsing the label, e.g. "1280x720"
                    resolution = parse_resolution(lbl)
                    width = width or resolution.get('width')
                    height = height or resolution.get('height')
                tbr = parse_bitrate(lbl)
                'format_id': s_attr.get('label') or s_attr.get('title'),
                f.update(formats[0])
                media_info['formats'].append(f)
                media_info['formats'].extend(formats)
            for track_tag in re.findall(r'<track[^>]+>', media_content):
                track_attributes = extract_attributes(track_tag)
                kind = track_attributes.get('kind')
                if not kind or kind in ('subtitles', 'captions'):
                    src = strip_or_none(track_attributes.get('src'))
                    lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                    media_info['subtitles'].setdefault(lang, []).append({
                        'url': absolute_url(src),
            for f in media_info['formats']:
                # some servers require a Referer for the media URLs
                f.setdefault('http_headers', {})['Referer'] = base_url
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
        """Extract HDS/HLS/progressive formats from an Akamai manifest URL."""
        # NOTE(review): interior lines appear to be missing from this copy
        # (the formats accumulator, host guards, http_f construction) —
        # verify against upstream before editing further.
        signed = 'hdnea=' in manifest_url
        # https://learn.akamai.com/en-us/webhelp/media-services-on-demand/stream-packaging-user-guide/GUID-BE6C0F73-1E06-483B-B0EA-57984B91B7F9.html
        manifest_url = re.sub(
            r'(?:b=[\d,-]+|(?:__a__|attributes)=off|__b__=\d+)&?',
            '', manifest_url).strip('?')
        hdcore_sign = 'hdcore=3.7.0'
        # convert the HLS URL shape (/i/ ... master.m3u8) to the HDS one
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
        f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)
        # and the reverse conversion for HLS
        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
        m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        m3u8_formats = self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False)
        formats.extend(m3u8_formats)
        http_host = hosts.get('http')
        if http_host and m3u8_formats and not signed:
            REPL_REGEX = r'https?://[^/]+/i/([^,]+),([^/]+),([^/]+)\.csmil/.+'
            qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
            qualities_length = len(qualities)
            if len(m3u8_formats) in (qualities_length, qualities_length + 1):
                for f in m3u8_formats:
                    if f['vcodec'] != 'none':
                        for protocol in ('http', 'https'):
                            del http_f['manifest_url']
                            REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url'])
                            'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
                            'protocol': protocol,
                            formats.append(http_f)
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        """Probe a Wowza server URL for HLS/HDS/DASH/SMIL/RTMP/RTSP formats."""
        # NOTE(review): interior lines appear to be missing from this copy
        # (the mobj = re.search head, the formats accumulator, guard bodies
        # and format dict heads) — verify against upstream before editing.
        query = compat_urlparse.urlparse(url).query
        # strip any trailing manifest file name from the URL
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)

        def manifest_url(manifest):
            # build a manifest URL preserving the original query string
            m_url = '%s/%s' % (http_base_url, manifest)
                m_url += '?%s' % query
        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    # derive an RTSP variant from each RTMP format
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                    formats.extend([rtmp_format, rtsp_format])
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                    'url': '%s:%s' % (protocol, url_base),
                    'format_id': protocol,
                    'protocol': protocol,
    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
        """Search webpage for a jwplayer(...).setup(...) call and return the parsed options dict, if any."""
        # NOTE(review): the `mobj = re.search(` head, `if mobj:`/`try:` lines
        # and failure branches appear to be missing from this copy.
            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
            jwplayer_data = self._parse_json(mobj.group('options'),
                                             transform_source=transform_source)
        except ExtractorError:
        # only a dict is a usable JWPlayer config
        if isinstance(jwplayer_data, dict):
            return jwplayer_data
3028 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
3029 jwplayer_data = self._find_jwplayer_data(
3030 webpage, video_id, transform_source=js_to_json)
3031 return self._parse_jwplayer_data(
3032 jwplayer_data, video_id, *args, **kwargs)
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Turn a JWPlayer config dict into info-dict entries (single entry or playlist result)."""
        # NOTE(review): interior lines appear to be missing from this copy
        # (entries/subtitles initialisation, `continue` lines in the tracks
        # loop, the entry dict head) — verify against upstream before editing.
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}
        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]
        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]
            this_video_id = video_id or video_data['mediaid']
            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                    if track_kind.lower() not in ('captions', 'subtitles'):
                    track_url = urljoin(base_url, track.get('file'))
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': clean_html(video_data.get('description')),
                'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                # a single YouTube URL is delegated to the YouTube extractor
                '_type': 'url_transparent',
                'url': formats[0]['url'],
            self._sort_formats(formats)
            entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
        return self.playlist_result(entries)
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Build a formats list from a JWPlayer `sources` array."""
        # NOTE(review): interior lines appear to be missing from this copy
        # (urls/formats initialisation, `continue` lines, the a_format dict
        # head, the smil branch condition) — verify against upstream.
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
            source_url = urljoin(
                base_url, self._proto_relative_url(source.get('file')))
            # de-duplicate by resolved URL
            if not source_url or source_url in urls:
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                height = int_or_none(source.get('height'))
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                'width': int_or_none(source.get('width')),
                'tbr': int_or_none(source.get('bitrate')),
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        'play_path': prefix + play_path,
                    a_format.update(rtmp_params)
                formats.append(a_format)
3166 def _live_title(self, name):
3167 """ Generate the title
for a live video
"""
3168 now = datetime.datetime.now()
3169 now_str = now.strftime('%Y-%m-%d %H:%M')
3170 return name + ' ' + now_str
    def _int(self, v, name, fatal=False, **kwargs):
        """Coerce v to int; on failure either raise (fatal=True) or warn, naming the field."""
        res = int_or_none(v, **kwargs)
        if 'get_attr' in kwargs:
            # debug aid: dump the requested attribute of the raw value
            print(getattr(v, kwargs['get_attr']))
        # NOTE(review): the guard lines (`if res is None:` / `if fatal:` /
        # `else:`) and the final `return res` appear to be missing from this
        # copy — verify against upstream before editing.
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            raise ExtractorError(msg)
            self._downloader.report_warning(msg)
    def _float(self, v, name, fatal=False, **kwargs):
        """Coerce v to float; on failure either raise (fatal=True) or warn, naming the field."""
        res = float_or_none(v, **kwargs)
        # NOTE(review): the guard lines (`if res is None:` / `if fatal:` /
        # `else:`) and the final `return res` appear to be missing from this
        # copy — verify against upstream before editing.
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            raise ExtractorError(msg)
            self._downloader.report_warning(msg)
3194 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
3195 path='/', secure=False, discard=False, rest={}, **kwargs):
3196 cookie = compat_cookiejar_Cookie(
3197 0, name, value, port, port is not None, domain, True,
3198 domain.startswith('.'), path, True, secure, expire_time,
3199 discard, None, None, rest)
3200 self._downloader.cookiejar.set_cookie(cookie)
3202 def _get_cookies(self, url):
3203 """ Return a compat_cookies
.SimpleCookie
with the cookies
for the url
"""
3204 req = sanitized_Request(url)
3205 self._downloader.cookiejar.add_cookie_header(req)
3206 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
    def _apply_first_set_cookie_header(self, url_handle, cookie):
        """
        Apply first Set-Cookie header instead of the last. Experimental.

        Some sites (e.g. [1-3]) may serve two cookies under the same name
        in Set-Cookie header and expect the first (old) one to be set rather
        than second (new). However, as of RFC6265 the newer one cookie
        should be set into cookie store what actually happens.
        We will workaround this issue by resetting the cookie to
        the first one manually.
        1. https://new.vk.com/
        2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
        3. https://learning.oreilly.com/
        """
        # NOTE(review): a `continue` line, the `if cookie_value:` guard and a
        # `break` appear to be missing from this copy — verify upstream.
        for header, cookies in url_handle.headers.items():
            if header.lower() != 'set-cookie':
            # normalize header bytes to a text string on Python 3
            if sys.version_info[0] >= 3:
                cookies = cookies.encode('iso-8859-1')
            cookies = cookies.decode('utf-8')
            cookie_value = re.search(
                r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
                value, domain = cookie_value.groups()
                # re-set the first occurrence, overriding the later one
                self._set_cookie(domain, cookie, value)
    def get_testcases(self, include_onlymatching=False):
        """Yield this extractor's test cases from _TEST/_TESTS, tagging each with the IE name."""
        t = getattr(self, '_TEST', None)
        # an extractor must define either _TEST or _TESTS, never both
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % type(self).__name__
            tests = getattr(self, '_TESTS', [])
        # NOTE(review): branch heads, the tests loop and `continue`/`yield`
        # lines appear to be missing from this copy — verify upstream.
            if not include_onlymatching and t.get('only_matching', False):
            t['name'] = type(self).__name__[:-len('IE')]
    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """
        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            if tc.get('playlist', []):
                # judge by the first playlist entry's metadata
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
                # NOTE(review): the body of this guard (an early `return False`
                # upstream) appears to be missing from this copy.
            any_restricted = any_restricted or is_restricted
        return not any_restricted
    def extract_subtitles(self, *args, **kwargs):
        """Return subtitles when the user asked to write or list them; delegates to _get_subtitles."""
        if (self._downloader.params.get('writesubtitles', False)
                or self._downloader.params.get('listsubtitles')):
            return self._get_subtitles(*args, **kwargs)
        # NOTE(review): upstream returns {} on the fall-through path; that
        # line appears to be missing from this copy.
3270 def _get_subtitles(self, *args, **kwargs):
3271 raise NotImplementedError('This method must be implemented by subclasses')
3274 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
3275 """ Merge subtitle items
for one language
. Items
with duplicated URLs
3276 will be dropped
. """
3277 list1_urls = set([item['url'] for item in subtitle_list1])
3278 ret = list(subtitle_list1)
3279 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
3283 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
3284 """ Merge two subtitle dictionaries
, language by language
. """
3285 ret = dict(subtitle_dict1)
3286 for lang in subtitle_dict2:
3287 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
    def extract_automatic_captions(self, *args, **kwargs):
        """Return automatic captions when the user asked to write or list subtitles."""
        if (self._downloader.params.get('writeautomaticsub', False)
                or self._downloader.params.get('listsubtitles')):
            return self._get_automatic_captions(*args, **kwargs)
        # NOTE(review): upstream returns {} on the fall-through path; that
        # line appears to be missing from this copy.
3296 def _get_automatic_captions(self, *args, **kwargs):
3297 raise NotImplementedError('This method must be implemented by subclasses')
3299 def mark_watched(self, *args, **kwargs):
3300 if (self._downloader.params.get('mark_watched', False)
3301 and (self._get_login_info()[0] is not None
3302 or self._downloader.params.get('cookiefile') is not None)):
3303 self._mark_watched(*args, **kwargs)
3305 def _mark_watched(self, *args, **kwargs):
3306 raise NotImplementedError('This method must be implemented by subclasses')
    def geo_verification_headers(self):
        """Build HTTP headers carrying the configured geo-verification proxy, if any."""
        # NOTE(review): the `headers = {}` initialisation and the final
        # `return headers` appear to be missing from this copy.
        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
        if geo_verification_proxy:
            headers['Ytdl-request-proxy'] = geo_verification_proxy
3315 def _generic_id(self, url):
3316 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
3318 def _generic_title(self, url):
3319 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
    # NOTE(review): decorators (@classmethod on _make_valid_url/suitable,
    # @property on SEARCH_KEY) and several guard/branch lines appear to be
    # missing from this copy — verify against upstream before editing.

    def _make_valid_url(cls):
        # regex accepting '<key>:q', '<key>N:q' and '<key>all:q'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        # NOTE(review): the `if mobj is None:` guard line appears missing here.
            raise ExtractorError('Invalid search query "%s"' % query)
        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # empty prefix: single result; 'all': extractor maximum; number: n results
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    def SEARCH_KEY(self):
        return self._SEARCH_KEY