from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import netrc
import os
import re
import socket
import sys
import time
import math

from ..compat import (
    compat_cookiejar,
    compat_cookies,
    compat_etree_fromstring,
    compat_getpass,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
)
from ..downloader.f4m import remove_encrypted_media
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    error_to_compat_str,
    ExtractorError,
    fix_xml_ampersands,
    float_or_none,
    int_or_none,
    parse_iso8601,
    RegexNotFoundError,
    sanitize_filename,
    sanitized_Request,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    url_basename,
    xpath_element,
    xpath_text,
    xpath_with_ns,
    determine_protocol,
    parse_duration,
    mimetype2ext,
    update_Request,
    update_url_query,
    parse_m3u8_attributes,
    extract_attributes,
    parse_codecs,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url          Mandatory. The URL of the video file
                    * manifest_url
                                   The URL of the manifest file in case of
                                   fragmented media (DASH, HLS, HDS)
                    * ext          Will be calculated from URL if missing
                    * format       A human-readable description of the format
                                   ("mp4 container with h264/opus").
                                   Calculated from the format_id, width, height,
                                   and format_note fields if missing.
                    * format_id    A short description of the format
                                   ("mp4_h264_opus" or "19").
                                   Technically optional, but strongly recommended.
                    * format_note  Additional info about the format
                                   ("3D" or "DASH video")
                    * width        Width of the video, if known
                    * height       Height of the video, if known
                    * resolution   Textual description of width and height
                    * tbr          Average bitrate of audio and video in KBit/s
                    * abr          Average audio bitrate in KBit/s
                    * acodec       Name of the audio codec in use
                    * asr          Audio sampling rate in Hertz
                    * vbr          Average video bitrate in KBit/s
                    * fps          Frame rate
                    * vcodec       Name of the video codec in use
                    * container    Name of the container format
                    * filesize     The number of bytes, if known in advance
                    * filesize_approx
                                   An estimate for the number of bytes
                    * player_url   SWF Player URL (used for rtmpdump).
                    * protocol     The protocol that will be used for the actual
                                   download, lower-case.
                                   "http", "https", "rtsp", "rtmp", "rtmpe",
                                   "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragments    A list of fragments of the fragmented media,
                                   with the following entries:
                                   * "url" (mandatory) - fragment's URL
                                   * "duration" (optional, int or float)
                                   * "filesize" (optional, int)
                    * preference   Order number of this format. If this field is
                                   present and not None, the formats get sorted
                                   by this field, regardless of all other values.
                                   -1 for default (order by other properties),
                                   -2 or smaller for less than default.
                                   < -1000 to hide the format (if there is
                                   another one which is strictly better)
                    * language     Language code, e.g. "de" or "en-US".
                    * language_preference
                                   Is this in the language mentioned in
                                   the URL?
                                   10 if it's what the URL is about,
                                   -1 for default (don't know),
                                   -10 otherwise, other values reserved for now.
                    * quality      Order number of the video quality of this
                                   format, irrespective of the file format.
                                   -1 for default (order by other properties),
                                   -2 or smaller for less than default.
                    * source_preference
                                   Order number for this video source
                                   (quality takes higher priority)
                                   -1 for default (order by other properties),
                                   -2 or smaller for less than default.
                    * http_headers A dictionary of additional HTTP headers
                                   to add to the request.
                    * stretched_ratio
                                   If given and not 1, indicates that the
                                   video's pixels are not square.
                                   width : height ratio as float.
                    * no_resume    The server does not support resuming the
                                   (HTTP or RTMP) download. Boolean.

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                          deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {language: subformats}. "subformats" is a list sorted from
                    lower to higher preference, each element is a dictionary
                    with the "ext" entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles', used by the YoutubeIE for
                    automatically generated captions
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all optional, but at least one of text or html
                    must be present):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                          Set to "root" to indicate that this is a
                          comment to the original video.
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, which, if given to youtube-dl,
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series or programme:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists who appeared on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.

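    For illustration only, here is a hypothetical minimal result (all values
    are made up) that satisfies the mandatory fields described above:

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'formats': [{
                'url': 'https://example.com/video-low.mp4',
                'format_id': 'low',
                'ext': 'mp4',
                'height': 360,
            }, {
                'url': 'https://example.com/video-hd.mp4',
                'format_id': 'hd',
                'ext': 'mp4',
                'height': 1080,
            }],
        }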

    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "title", "description" and "id" attributes
    with the same semantics as videos (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example, multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

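    A minimal sketch of such a subclass, for illustration only (the site,
    URL pattern and media URL are hypothetical):

        class ExampleIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'

            def _real_extract(self, url):
                video_id = self._match_id(url)
                webpage = self._download_webpage(url, video_id)
                return {
                    'id': video_id,
                    'title': self._og_search_title(webpage),
                    'url': 'https://example.com/media/%s.mp4' % video_id,
                }
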
    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def _match_id(cls, url):
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        assert m
        return m.group('id')

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        try:
            self.initialize()
            return self._real_extract(url)
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return compat_str(cls.__name__[:-2])

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))
        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
        """ Returns a tuple (page content as string, URL handle) """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
        """ Returns the data of the page as a string """
        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
        if xml_string is False:
            return xml_string
        if transform_source:
            xml_string = transform_source(xml_string)
        return compat_etree_fromstring(xml_string.encode('utf-8'))

    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
                       fatal=True, encoding=None, data=None, headers={}, query={}):
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query)
        if (not fatal) and json_string is False:
            return None
        return self._parse_json(
            json_string, video_id, transform_source=transform_source, fatal=fatal)

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def report_warning(self, msg, video_id=None):
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)

    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
        raise ExtractorError(
            '%s. You might want to use --proxy to work around this.' % msg,
            expected=True)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description:
            video_info['description'] = playlist_description
        return video_info

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, returning the first matching group.
        In case of failure, return a default value, emit a warning, or raise a
        RegexNotFoundError, depending on default and fatal; the field name is
        used in the messages.
        """
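        # Usage sketch, for illustration only (the patterns and page content
        # are hypothetical):
        #
        #     video_id = self._search_regex(
        #         r'data-video-id="(\d+)"', webpage, 'video id')
        #     uploader = self._search_regex(
        #         r'by <b>([^<]+)</b>', webpage, 'uploader', default=None)
        #
        # The first call returns the captured group or raises
        # RegexNotFoundError; the second returns None when nothing matches.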
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self._downloader.params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, look in the netrc file using the
        netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get(username_option) is not None:
            username = downloader_params[username_option]
            password = downloader_params[password_option]
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for SMS/phone verification;
        currently this just uses the command line option.
        If there's no info available, return None
        """
        if self._downloader is None:
            return None
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor') is not None:
            return downloader_params['twofactor']

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

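    # For illustration: the two templates returned above cover both attribute
    # orders, so _og_regexes('title') matches hypothetical markup like
    #     <meta property="og:title" content="Some title">
    # as well as
    #     <meta content="Some title" property="og:title">
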
    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if not isinstance(prop, (list, tuple)):
            prop = [prop]
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if not isinstance(name, (list, tuple)):
            name = [name]
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta('isFamilyFriendly', html)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld = self._search_regex(
            r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
            html, 'JSON-LD', group='json_ld', **kwargs)
        default = kwargs.get('default', NO_DEFAULT)
        if not json_ld:
            return default if default is not NO_DEFAULT else {}
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time a `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
        return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]
        for e in json_ld:
            if e.get('@context') == 'http://schema.org':
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type == 'TVEpisode':
                    info.update({
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Article':
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    info.update({
                        'url': e.get('contentUrl'),
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'thumbnail': e.get('thumbnailUrl'),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('uploadDate')),
                        'filesize': float_or_none(e.get('contentSize')),
                        'tbr': int_or_none(e.get('bitrate')),
                        'width': int_or_none(e.get('width')),
                        'height': int_or_none(e.get('height')),
                    })
                break
        return dict((k, v) for k, v in info.items() if v is not None)

    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input)
            if not attrs:
                continue
            if attrs.get('type') not in ('hidden', 'submit'):
                continue
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs

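    # For illustration, given hypothetical markup like
    #     <input type="hidden" name="csrf_token" value="abc123">
    # _hidden_inputs returns {'csrf_token': 'abc123'}.
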
    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)

    def _sort_formats(self, formats, field_preference=None):
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)

    def _check_formats(self, formats, video_id):
        if formats:
            formats[:] = filter(
                lambda f: self._is_valid_url(
                    f['url'], video_id,
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
                formats)

    @staticmethod
    def _remove_duplicate_formats(formats):
        format_urls = set()
        unique_formats = []
        for f in formats:
            if f['url'] not in format_urls:
                format_urls.add(f['url'])
                unique_formats.append(f)
        formats[:] = unique_formats

    def _is_valid_url(self, url, video_id, item='video'):
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs are always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            return True
        try:
            self._request_webpage(url, video_id, 'Checking %s URL' % item)
            return True
        except ExtractorError as e:
            if isinstance(e.cause, compat_urllib_error.URLError):
                self.to_screen(
                    '%s: %s URL is invalid, skipping' % (video_id, item))
                return False
            raise

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self._downloader.params.get('prefer_insecure', False)
            else 'https:')

    def _proto_relative_url(self, url, scheme=None):
        if url is None:
            return url
        if url.startswith('//'):
            if scheme is None:
                scheme = self.http_scheme()
            return scheme + url
        else:
            return url

    def _sleep(self, timeout, video_id, msg_template=None):
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)

    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True, m3u8_id=None):
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,
            fatal=fatal)

        if manifest is False:
            return []

        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)

    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
        if base_url:
            base_url = base_url.strip()

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest, do the recursive extraction,
                # since bitrates in the parent manifest (this one) and in the media_url
                # manifest may differ, leading to an inability to resolve the format
                # by the requested bitrate in the f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes a stream-level manifest contains a single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time the parent's media entry in the set-level manifest may
                    # contain it. We will copy it from the parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'tbr': tbr,
                'width': width,
                'height': height,
                'preference': preference,
            })
        return formats

    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
        return {
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 100 if preference else -100,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }

    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True, live=False):

        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if res is False:
            return []
        m3u8_doc, urlh = res
        m3u8_url = urlh.geturl()

        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]

        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # We should try extracting formats only from master playlists [1], i.e.
        # playlists that describe available qualities. On the other hand media
        # playlists [2] should be returned as is since they contain just the media
        # without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 2] master
        # playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
        # and MUST NOT appear in master playlist thus we can clearly detect media
        # playlist with this criterion.
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
        # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
        # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
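        # For illustration, a made-up master playlist such as
        #
        #     #EXTM3U
        #     #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
        #     low/index.m3u8
        #     #EXT-X-STREAM-INF:BANDWIDTH=2560000,RESOLUTION=1280x720
        #     hd/index.m3u8
        #
        # is parsed by the loop below into two variant formats (in addition to
        # the meta format appended above), while a media playlist is detected
        # by the #EXT-X-TARGETDURATION check that follows and returned as a
        # single format.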
        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]
        last_info = {}
        last_media = {}
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                media = parse_m3u8_attributes(line)
                media_type = media.get('TYPE')
                if media_type in ('VIDEO', 'AUDIO'):
                    media_url = media.get('URI')
                    if media_url:
                        format_id = []
                        for v in (media.get('GROUP-ID'), media.get('NAME')):
                            if v:
                                format_id.append(v)
                        formats.append({
                            'format_id': '-'.join(format_id),
                            'url': format_url(media_url),
                            'language': media.get('LANGUAGE'),
                            'vcodec': 'none' if media_type == 'AUDIO' else None,
                            'ext': ext,
                            'protocol': entry_protocol,
                            'preference': preference,
                        })
                    else:
                        # When there is no URI in EXT-X-MEDIA let this tag's
                        # data be used by regular URI lines below
                        last_media = media
            elif line.startswith('#') or not line.strip():
                continue
            else:
                tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                # Although the specification does not mention the NAME attribute for
                # EXT-X-STREAM-INF, it may still sometimes be present
                stream_name = last_info.get('NAME') or last_media.get('NAME')
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(line.strip()),
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_info.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    width_str, height_str = resolution.split('x')
                    f['width'] = int(width_str)
                    f['height'] = int(height_str)
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                f.update(parse_codecs(last_info.get('CODECS')))
                formats.append(f)
                last_info = {}
                last_media = {}
        return formats

    @staticmethod
    def _xpath_ns(path, namespace=None):
        if not namespace:
            return path
        out = []
        for c in path.split('/'):
            if not c or c == '.':
                out.append(c)
            else:
                out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)

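    # For illustration (hypothetical namespace URI):
    #     _xpath_ns('./head/meta', 'urn:example')
    # returns './{urn:example}head/{urn:example}meta'.
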
    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)

        if smil is False:
            assert not fatal
            return []

        namespace = self._parse_smil_namespace(smil)

        return self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)

    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        if smil is False:
            return {}
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)

    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
        return self._download_xml(
            smil_url, video_id, 'Downloading SMIL file',
            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)

    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        namespace = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

        video_id = os.path.splitext(url_basename(smil_url))[0]
        title = None
        description = None
        upload_date = None
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
                continue
            if not title and name == 'title':
                title = content
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)

        thumbnails = [{
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]

        return {
            'id': video_id,
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'formats': formats,
            'subtitles': subtitles,
        }

    def _parse_smil_namespace(self, smil):
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)

    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
                continue

            if src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
                continue

            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        return formats

1461 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1462 urls = []
1463 subtitles = {}
1464 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1465 src = textstream.get('src')
1466 if not src or src in urls:
1467 continue
1468 urls.append(src)
1469 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1470 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1471 subtitles.setdefault(lang, []).append({
1472 'url': src,
1473 'ext': ext,
1474 })
1475 return subtitles
1476
1477 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1478 xspf = self._download_xml(
1479 playlist_url, playlist_id, 'Downloading xpsf playlist',
1480 'Unable to download xspf manifest', fatal=fatal)
1481 if xspf is False:
1482 return []
1483 return self._parse_xspf(xspf, playlist_id)

    def _parse_xspf(self, playlist, playlist_id):
        NS_MAP = {
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        }

        entries = []
        for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
            title = xpath_text(
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)

            formats = [{
                'url': location.text,
                'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
            self._sort_formats(formats)

            entries.append({
                'id': playlist_id,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'formats': formats,
            })
        return entries
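    # Roughly, _parse_xspf() above turns a track like (made-up values; the
    # s1:* attributes live in the StreamOne namespace):
    #   <track><title>Some title</title>
    #     <location s1:label="hd" s1:width="1280" s1:height="720">http://example.com/v.mp4</location>
    #   </track>
    # into an entry whose formats contain
    #   {'url': 'http://example.com/v.mp4', 'format_id': 'hd', 'width': 1280, 'height': 720}.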

    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
        res = self._download_webpage_handle(
            mpd_url, video_id,
            note=note or 'Downloading MPD manifest',
            errnote=errnote or 'Failed to download MPD manifest',
            fatal=fatal)
        if res is False:
            return []
        mpd, urlh = res
        mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()

        return self._parse_mpd_formats(
            compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
            formats_dict=formats_dict, mpd_url=mpd_url)

    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
        """
        if mpd_doc.get('type') == 'dynamic':
            return []

        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

        def _add_ns(path):
            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements. We only extract what is relevant
            # for us.
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                    if s_e:
                        ms_info['total_number'] = 0
                        ms_info['s'] = []
                        for s in s_e:
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                                'r': r,
                            })
                start_number = source.get('startNumber')
                if start_number:
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                if timescale:
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = int(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                if segment_urls_e:
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            else:
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media_template = segment_template.get('media')
                    if media_template:
                        ms_info['media_template'] = media_template
                    initialization = segment_template.get('initialization')
                    if initialization:
                        ms_info['initialization_url'] = initialization
                    else:
                        extract_Initialization(segment_template)
            return ms_info
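        # For instance (hypothetical manifest), a
        #   <SegmentTemplate timescale="1000" duration="4000" startNumber="1"
        #                    media="seg-$Number$.m4s" initialization="init.m4s"/>
        # would leave ms_info as {'start_number': 1, 'timescale': 1000,
        # 'segment_duration': 4000, 'media_template': 'seg-$Number$.m4s',
        # 'initialization_url': 'init.m4s'}.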

        def combine_url(base_url, target_url):
            if re.match(r'^https?://', target_url):
                return target_url
            return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url)
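        # E.g. combine_url('http://example.com/dash', 'seg-1.m4s') gives
        # 'http://example.com/dash/seg-1.m4s' (illustrative URLs); absolute
        # target URLs are returned unchanged.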

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        formats = []
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
                'start_number': 1,
                'timescale': 1,
            })
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                if is_drm_protected(adaptation_set):
                    continue
                adaptation_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    if is_drm_protected(representation):
                        continue
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = mime_type.split('/')[0]
                    if content_type == 'text':
                        # TODO implement WebVTT downloading
                        pass
                    elif content_type == 'video' or content_type == 'audio':
                        base_url = ''
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
                            if base_url_e is not None:
                                base_url = base_url_e.text + base_url
                                if re.match(r'^https?://', base_url):
                                    break
                        if mpd_base_url and not re.match(r'^https?://', base_url):
                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                                mpd_base_url += '/'
                            base_url = mpd_base_url + base_url
                        representation_id = representation_attrib.get('id')
                        lang = representation_attrib.get('lang')
                        url_el = representation.find(_add_ns('BaseURL'))
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                        f = {
                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                            'url': base_url,
                            'manifest_url': mpd_url,
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
                            'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                        }
                        representation_ms_info = extract_multisegment_info(representation, adaptation_set_ms_info)
                        if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
                            media_template = representation_ms_info['media_template']
                            media_template = media_template.replace('$RepresentationID$', representation_id)
                            media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template)
                            media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template)
                            media_template = media_template.replace('$$', '$')
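                            # Illustration (made-up template): with representation_id 'video1',
                            # 'seg-$RepresentationID$-$Number%05d$.m4s' becomes the printf-style
                            # pattern 'seg-video1-%(Number)05d.m4s', so that
                            # media_template % {'Number': 3} == 'seg-video1-00003.m4s'.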

                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                            # can't be used at the same time
                            if '%(Number' in media_template and 's' not in representation_ms_info:
                                segment_duration = None
                                if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                representation_ms_info['fragments'] = [{
                                    'url': media_template % {
                                        'Number': segment_number,
                                        'Bandwidth': representation_attrib.get('bandwidth'),
                                    },
                                    'duration': segment_duration,
                                } for segment_number in range(
                                    representation_ms_info['start_number'],
                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                            else:
                                # $Number*$ or $Time$ in media template with S list available
                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                                representation_ms_info['fragments'] = []
                                segment_time = 0
                                segment_d = None
                                segment_number = representation_ms_info['start_number']

                                def add_segment_url():
                                    segment_url = media_template % {
                                        'Time': segment_time,
                                        'Bandwidth': representation_attrib.get('bandwidth'),
                                        'Number': segment_number,
                                    }
                                    representation_ms_info['fragments'].append({
                                        'url': segment_url,
                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                                    })

                                for s in representation_ms_info['s']:
                                    segment_time = s.get('t') or segment_time
                                    segment_d = s['d']
                                    add_segment_url()
                                    segment_number += 1
                                    for r in range(s.get('r', 0)):
                                        segment_time += segment_d
                                        add_segment_url()
                                        segment_number += 1
                                    segment_time += segment_d
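                                # Sketch of the expansion (hypothetical values): an S list
                                # [{'t': 0, 'd': 4000, 'r': 1}] with timescale 1000 yields
                                # fragments at Time 0 and Time 4000, each 4.0 seconds long.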
                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                            # No media template
                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                            # or any YouTube dashsegments video
                            fragments = []
                            segment_index = 0
                            timescale = representation_ms_info['timescale']
                            # Walk the S timeline and consume one segment URL per
                            # repetition so the index actually advances
                            for s in representation_ms_info['s']:
                                duration = float_or_none(s['d'], timescale)
                                for r in range(s.get('r', 0) + 1):
                                    fragments.append({
                                        'url': representation_ms_info['segment_urls'][segment_index],
                                        'duration': duration,
                                    })
                                    segment_index += 1
                            representation_ms_info['fragments'] = fragments
                        # NB: MPD manifest may contain direct URLs to unfragmented media.
                        # No fragments key is present in this case.
                        if 'fragments' in representation_ms_info:
                            f.update({
                                'fragments': [],
                                'protocol': 'http_dash_segments',
                            })
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
                                if not f.get('url'):
                                    f['url'] = initialization_url
                                f['fragments'].append({'url': initialization_url})
                            f['fragments'].extend(representation_ms_info['fragments'])
                            for fragment in f['fragments']:
                                fragment['url'] = combine_url(base_url, fragment['url'])
                        try:
                            existing_format = next(
                                fo for fo in formats
                                if fo['format_id'] == representation_id)
                        except StopIteration:
                            full_info = formats_dict.get(representation_id, {}).copy()
                            full_info.update(f)
                            formats.append(full_info)
                        else:
                            existing_format.update(f)
                    else:
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
        return formats

    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8'):
        def absolute_url(video_url):
            return compat_urlparse.urljoin(base_url, video_url)

        def parse_content_type(content_type):
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}
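        # For example (illustrative value):
        #   parse_content_type('video/mp4; codecs="avc1.42E01E, mp4a.40.2"')
        # gives {'vcodec': 'avc1.42E01E', 'acodec': 'mp4a.40.2', 'ext': 'mp4'}.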

        def _media_formats(src, cur_media_type):
            full_url = absolute_url(src)
            if determine_ext(full_url) == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = media_attributes.get('poster')
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    if not src:
                        continue
                    is_plain_url, formats = _media_formats(src, media_type)
                    if is_plain_url:
                        f = parse_content_type(source_attributes.get('type'))
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind == 'subtitles':
                        src = track_attributes.get('src')
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            if media_info['formats']:
                entries.append(media_info)
        return entries

    def _extract_akamai_formats(self, manifest_url, video_id):
        formats = []
        f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        formats.extend(self._extract_f4m_formats(
            update_url_query(f4m_url, {'hdcore': '3.7.0'}),
            video_id, f4m_id='hds', fatal=False))
        m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        formats.extend(self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False))
        return formats
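    # E.g. (made-up Akamai-style URL) the HLS manifest
    #   http://example.akamaihd.net/i/foo/master.m3u8
    # is rewritten above into the HDS counterpart
    #   http://example.akamaihd.net/z/foo/manifest.f4m (and vice versa).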

    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
        http_base_url = 'http' + url_base
        formats = []
        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                http_base_url + '/playlist.m3u8', video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                http_base_url + '/manifest.f4m',
                video_id, f4m_id='hds', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'dash' not in skip_protocols:
                formats.extend(self._extract_mpd_formats(
                    http_base_url + '/manifest.mpd',
                    video_id, mpd_id='dash', fatal=False))
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    http_base_url + '/jwplayer.smil',
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                        'protocol': 'rtsp',
                    })
                    formats.extend([rtmp_format, rtsp_format])
        else:
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                    formats.append({
                        'url': protocol + url_base,
                        'format_id': protocol,
                        'protocol': protocol,
                    })
        return formats
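    # Sketch of the URL handling above (hypothetical stream): for
    #   rtmp://example.com/vod/smil:video.smil/playlist.m3u8
    # the manifest suffix is stripped, url_base becomes
    # '://example.com/vod/smil:video.smil' and HTTP manifests are probed at
    # http://example.com/vod/smil:video.smil/{playlist.m3u8,manifest.f4m,...}.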

    def _live_title(self, name):
        """ Generate the title for a live video """
        now = datetime.datetime.now()
        now_str = now.strftime('%Y-%m-%d %H:%M')
        return name + ' ' + now_str
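    # e.g. _live_title('Some stream') -> 'Some stream 2016-09-25 14:30'
    # (illustrative; the suffix is the current local time).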

    def _int(self, v, name, fatal=False, **kwargs):
        res = int_or_none(v, **kwargs)
        if res is None:
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            if fatal:
                raise ExtractorError(msg)
            else:
                self._downloader.report_warning(msg)
        return res

    def _float(self, v, name, fatal=False, **kwargs):
        res = float_or_none(v, **kwargs)
        if res is None:
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            if fatal:
                raise ExtractorError(msg)
            else:
                self._downloader.report_warning(msg)
        return res

    def _set_cookie(self, domain, name, value, expire_time=None):
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)

    def _get_cookies(self, url):
        """ Return a compat_cookies.SimpleCookie with the cookies for the url """
        req = sanitized_Request(url)
        self._downloader.cookiejar.add_cookie_header(req)
        return compat_cookies.SimpleCookie(req.get_header('Cookie'))

    def get_testcases(self, include_onlymatching=False):
        t = getattr(self, '_TEST', None)
        if t:
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % type(self).__name__
            tests = [t]
        else:
            tests = getattr(self, '_TESTS', [])
        for t in tests:
            if not include_onlymatching and t.get('only_matching', False):
                continue
            t['name'] = type(self).__name__[:-len('IE')]
            yield t

    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """

        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            if tc.get('playlist', []):
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
                return True
            any_restricted = any_restricted or is_restricted
        return not any_restricted

    def extract_subtitles(self, *args, **kwargs):
        if (self._downloader.params.get('writesubtitles', False) or
                self._downloader.params.get('listsubtitles')):
            return self._get_subtitles(*args, **kwargs)
        return {}

    def _get_subtitles(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    @staticmethod
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs
        will be dropped. """
        list1_urls = set([item['url'] for item in subtitle_list1])
        ret = list(subtitle_list1)
        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
        return ret

    @classmethod
    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
        """ Merge two subtitle dictionaries, language by language. """
        ret = dict(subtitle_dict1)
        for lang in subtitle_dict2:
            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
        return ret
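    # A small illustration (made-up URLs): merging
    #   {'en': [{'url': 'http://a/1.vtt'}]}
    # with
    #   {'en': [{'url': 'http://a/1.vtt'}, {'url': 'http://a/2.vtt'}]}
    # keeps a single 'http://a/1.vtt' item and appends 'http://a/2.vtt'.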

    def extract_automatic_captions(self, *args, **kwargs):
        if (self._downloader.params.get('writeautomaticsub', False) or
                self._downloader.params.get('listsubtitles')):
            return self._get_automatic_captions(*args, **kwargs)
        return {}

    def _get_automatic_captions(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    def mark_watched(self, *args, **kwargs):
        if (self._downloader.params.get('mark_watched', False) and
                (self._get_login_info()[0] is not None or
                    self._downloader.params.get('cookiefile') is not None)):
            self._mark_watched(*args, **kwargs)

    def _mark_watched(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    def geo_verification_headers(self):
        headers = {}
        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
        if geo_verification_proxy:
            headers['Ytdl-request-proxy'] = geo_verification_proxy
        return headers


class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
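    # E.g. with _SEARCH_KEY = 'ytsearch' (illustrative) this matches
    # 'ytsearch:cats' (1 result), 'ytsearch5:cats' and 'ytsearchall:cats'.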

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY