]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
Merge pull request #8061 from dstftw/introduce-chapter-and-series-fields
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
f1a9d64e
PH
1from __future__ import unicode_literals
2
d6983cb4 3import base64
f4b1c7ad 4import datetime
3ec05685 5import hashlib
3d3538e4 6import json
4094b6e3 7import netrc
d6983cb4
PH
8import os
9import re
10import socket
11import sys
4094b6e3 12import time
d6983cb4 13
8c25f81b 14from ..compat import (
42939b61 15 compat_cookiejar,
799207e8 16 compat_cookies,
e64b7569 17 compat_getpass,
d6983cb4
PH
18 compat_http_client,
19 compat_urllib_error,
a107193e 20 compat_urllib_parse,
f0b5d6af 21 compat_urlparse,
d6983cb4 22 compat_str,
36e6f62c 23 compat_etree_fromstring,
8c25f81b
PH
24)
25from ..utils import (
c342041f 26 NO_DEFAULT,
05900629 27 age_restricted,
08f2a92c 28 bug_reports_message,
d6983cb4
PH
29 clean_html,
30 compiled_regex_type,
70f0f5a8 31 determine_ext,
9b9c5355 32 error_to_compat_str,
d6983cb4 33 ExtractorError,
97f4aecf 34 fix_xml_ampersands,
b14f3a4c 35 float_or_none,
31bb8d3f 36 int_or_none,
55b3e45b 37 RegexNotFoundError,
d41e6efc 38 sanitize_filename,
5c2266df 39 sanitized_Request,
f38de77f 40 unescapeHTML,
647eab45 41 unified_strdate,
a107193e 42 url_basename,
8d6765cf
S
43 xpath_text,
44 xpath_with_ns,
d497a201 45 determine_protocol,
d6983cb4 46)
c342041f 47
d6983cb4
PH
48
49class InfoExtractor(object):
50 """Information Extractor class.
51
52 Information extractors are the classes that, given a URL, extract
53 information about the video (or videos) the URL refers to. This
54 information includes the real video URL, the video title, author and
55 others. The information is stored in a dictionary which is then
5d380852 56 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
57 information possibly downloading the video to the file system, among
58 other possible outcomes.
59
cf0649f8 60 The type field determines the type of the result.
fed5d032
PH
61 By far the most common value (and the default if _type is missing) is
62 "video", which indicates a single video.
63
64 For a video, the dictionaries must include the following fields:
d6983cb4
PH
65
66 id: Video identifier.
d6983cb4 67 title: Video title, unescaped.
d67b0b15 68
f49d89ee 69 Additionally, it must contain either a formats entry or a url one:
d67b0b15 70
f49d89ee
PH
71 formats: A list of dictionaries for each format available, ordered
72 from worst to best quality.
73
74 Potential fields:
d67b0b15 75 * url Mandatory. The URL of the video file
10952eb2 76 * ext Will be calculated from URL if missing
d67b0b15
PH
77 * format A human-readable description of the format
78 ("mp4 container with h264/opus").
79 Calculated from the format_id, width, height.
80 and format_note fields if missing.
81 * format_id A short description of the format
5d4f3985
PH
82 ("mp4_h264_opus" or "19").
83 Technically optional, but strongly recommended.
d67b0b15
PH
84 * format_note Additional info about the format
85 ("3D" or "DASH video")
86 * width Width of the video, if known
87 * height Height of the video, if known
f49d89ee 88 * resolution Textual description of width and height
7217e148 89 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
90 * abr Average audio bitrate in KBit/s
91 * acodec Name of the audio codec in use
dd27fd17 92 * asr Audio sampling rate in Hertz
d67b0b15 93 * vbr Average video bitrate in KBit/s
fbb21cf5 94 * fps Frame rate
d67b0b15 95 * vcodec Name of the video codec in use
1394ce65 96 * container Name of the container format
d67b0b15 97 * filesize The number of bytes, if known in advance
9732d77e 98 * filesize_approx An estimate for the number of bytes
d67b0b15 99 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
100 * protocol The protocol that will be used for the actual
101 download, lower-case.
b04b8852
PH
102 "http", "https", "rtsp", "rtmp", "rtmpe",
103 "m3u8", or "m3u8_native".
f49d89ee 104 * preference Order number of this format. If this field is
08d13955 105 present and not None, the formats get sorted
38d63d84 106 by this field, regardless of all other values.
f49d89ee
PH
107 -1 for default (order by other properties),
108 -2 or smaller for less than default.
e65566a9
PH
109 < -1000 to hide the format (if there is
110 another one which is strictly better)
32f90364
PH
111 * language Language code, e.g. "de" or "en-US".
112 * language_preference Is this in the language mentioned in
113 the URL?
aff2f4f4
PH
114 10 if it's what the URL is about,
115 -1 for default (don't know),
116 -10 otherwise, other values reserved for now.
5d73273f
PH
117 * quality Order number of the video quality of this
118 format, irrespective of the file format.
119 -1 for default (order by other properties),
120 -2 or smaller for less than default.
c64ed2a3
PH
121 * source_preference Order number for this video source
122 (quality takes higher priority)
123 -1 for default (order by other properties),
124 -2 or smaller for less than default.
d769be6c
PH
125 * http_headers A dictionary of additional HTTP headers
126 to add to the request.
6271f1ca 127 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
128 video's pixels are not square.
129 width : height ratio as float.
130 * no_resume The server does not support resuming the
131 (HTTP or RTMP) download. Boolean.
132
c0ba0f48 133 url: Final video URL.
d6983cb4 134 ext: Video filename extension.
d67b0b15
PH
135 format: The video format, defaults to ext (used for --get-format)
136 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 137
d6983cb4
PH
138 The following fields are optional:
139
f5e43bc6 140 alt_title: A secondary title of the video.
0afef30b
PH
141 display_id An alternative identifier for the video, not necessarily
142 unique, but available before title. Typically, id is
143 something like "4234987", title "Dancing naked mole rats",
144 and display_id "dancing-naked-mole-rats"
d5519808 145 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 146 * "id" (optional, string) - Thumbnail format ID
d5519808 147 * "url"
cfb56d1a 148 * "preference" (optional, int) - quality of the image
d5519808
PH
149 * "width" (optional, int)
150 * "height" (optional, int)
151 * "resolution" (optional, string "{width}x{height}",
152 deprecated)
d6983cb4 153 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 154 description: Full video description.
d6983cb4 155 uploader: Full name of the video uploader.
9bb8e0a3 156 creator: The main artist who created the video.
8aab976b 157 release_date: The date (YYYYMMDD) when the video was released.
955c4514 158 timestamp: UNIX timestamp of the moment the video became available.
d6983cb4 159 upload_date: Video upload date (YYYYMMDD).
955c4514 160 If not explicitly set, calculated from timestamp.
d6983cb4 161 uploader_id: Nickname or id of the video uploader.
da9ec3b9 162 location: Physical location where the video was filmed.
a504ced0
JMF
163 subtitles: The available subtitles as a dictionary in the format
164 {language: subformats}. "subformats" is a list sorted from
165 lower to higher preference, each element is a dictionary
166 with the "ext" entry and one of:
167 * "data": The subtitles file contents
10952eb2 168 * "url": A URL pointing to the subtitles file
4bba3716 169 "ext" will be calculated from URL if missing
360e1ca5
JMF
170 automatic_captions: Like 'subtitles', used by the YoutubeIE for
171 automatically generated captions
62d231c0 172 duration: Length of the video in seconds, as an integer or float.
f3d29461 173 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
174 like_count: Number of positive ratings of the video
175 dislike_count: Number of negative ratings of the video
02835c6b 176 repost_count: Number of reposts of the video
2d30521a 177 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 178 comment_count: Number of comments on the video
dd622d7c
PH
179 comments: A list of comments, each with one or more of the following
180 properties (all but one of text or html optional):
181 * "author" - human-readable name of the comment author
182 * "author_id" - user ID of the comment author
183 * "id" - Comment ID
184 * "html" - Comment as HTML
185 * "text" - Plain text of the comment
186 * "timestamp" - UNIX timestamp of comment
187 * "parent" - ID of the comment this one is replying to.
188 Set to "root" to indicate that this is a
189 comment to the original video.
8dbe9899 190 age_limit: Age restriction for the video, as an integer (years)
10952eb2 191 webpage_url: The URL to the video webpage, if given to youtube-dl it
9103bbc5
JMF
192 should allow to get the same result again. (It will be set
193 by YoutubeDL if it's missing)
ad3bc6ac
PH
194 categories: A list of categories that the video falls in, for example
195 ["Sports", "Berlin"]
864f24bd 196 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
7267bd53
PH
197 is_live: True, False, or None (=unknown). Whether this video is a
198 live stream that goes on instead of a fixed-length video.
7c80519c 199 start_time: Time in seconds where the reproduction should start, as
10952eb2 200 specified in the URL.
297a564b 201 end_time: Time in seconds where the reproduction should end, as
10952eb2 202 specified in the URL.
d6983cb4 203
7109903e
S
204 The following fields should only be used when the video belongs to some logical
205 chapter or section:
206
207 chapter: Name or title of the chapter the video belongs to.
27bfd4e5
S
208 chapter_number: Number of the chapter the video belongs to, as an integer.
209 chapter_id: Id of the chapter the video belongs to, as a unicode string.
7109903e
S
210
211 The following fields should only be used when the video is an episode of some
212 series or programme:
213
214 series: Title of the series or programme the video episode belongs to.
215 season: Title of the season the video episode belongs to.
27bfd4e5
S
216 season_number: Number of the season the video episode belongs to, as an integer.
217 season_id: Id of the season the video episode belongs to, as a unicode string.
7109903e
S
218 episode: Title of the video episode. Unlike mandatory video title field,
219 this field should denote the exact title of the video episode
220 without any kind of decoration.
27bfd4e5
S
221 episode_number: Number of the video episode within a season, as an integer.
222 episode_id: Id of the video episode, as a unicode string.
7109903e 223
deefc05b 224 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 225
d838b1bd
PH
226 Unless mentioned otherwise, None is equivalent to absence of information.
227
fed5d032
PH
228
229 _type "playlist" indicates multiple videos.
b82f815f
PH
230 There must be a key "entries", which is a list, an iterable, or a PagedList
231 object, each element of which is a valid dictionary by this specification.
fed5d032 232
e0b9d78f
S
233 Additionally, playlists can have "title", "description" and "id" attributes
234 with the same semantics as videos (see above).
fed5d032
PH
235
236
237 _type "multi_video" indicates that there are multiple videos that
238 form a single show, for example multiple acts of an opera or TV episode.
239 It must have an entries key like a playlist and contain all the keys
240 required for a video at the same time.
241
242
243 _type "url" indicates that the video must be extracted from another
244 location, possibly by a different extractor. Its only required key is:
245 "url" - the next URL to extract.
f58766ce
PH
246 The key "ie_key" can be set to the class name (minus the trailing "IE",
247 e.g. "Youtube") if the extractor class is known in advance.
248 Additionally, the dictionary may have any properties of the resolved entity
249 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
250 known ahead of time.
251
252
253 _type "url_transparent" entities have the same specification as "url", but
254 indicate that the given additional information is more precise than the one
255 associated with the resolved URL.
256 This is useful when a site employs a video service that hosts the video and
257 its technical metadata, but that video service does not embed a useful
258 title, description etc.
259
260
d6983cb4
PH
261 Subclasses of this one should re-define the _real_initialize() and
262 _real_extract() methods and define a _VALID_URL regexp.
263 Probably, they should also be added to the list of extractors.
264
d6983cb4
PH
265 Finally, the _WORKING attribute should be set to False for broken IEs
266 in order to warn the users and skip the tests.
267 """
268
269 _ready = False
270 _downloader = None
271 _WORKING = True
272
273 def __init__(self, downloader=None):
274 """Constructor. Receives an optional downloader."""
275 self._ready = False
276 self.set_downloader(downloader)
277
278 @classmethod
279 def suitable(cls, url):
280 """Receives a URL and returns True if suitable for this IE."""
79cb2577
PH
281
282 # This does not use has/getattr intentionally - we want to know whether
283 # we have cached the regexp for *this* class, whereas getattr would also
284 # match the superclass
285 if '_VALID_URL_RE' not in cls.__dict__:
286 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
287 return cls._VALID_URL_RE.match(url) is not None
d6983cb4 288
ed9266db
PH
289 @classmethod
290 def _match_id(cls, url):
291 if '_VALID_URL_RE' not in cls.__dict__:
292 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
293 m = cls._VALID_URL_RE.match(url)
294 assert m
295 return m.group('id')
296
d6983cb4
PH
297 @classmethod
298 def working(cls):
299 """Getter method for _WORKING."""
300 return cls._WORKING
301
302 def initialize(self):
303 """Initializes an instance (authentication, etc)."""
304 if not self._ready:
305 self._real_initialize()
306 self._ready = True
307
def extract(self, url):
    """Initialize the extractor if needed and return _real_extract(url).

    Raises ExtractorError: errors of that type pass through unchanged,
    an IncompleteRead is reported as an expected network error, and a
    KeyError/StopIteration escaping the extractor is wrapped as an
    (unexpected) extractor error with the original exception as cause.
    """
    try:
        self.initialize()
        return self._real_extract(url)
    except ExtractorError:
        raise
    except compat_http_client.IncompleteRead as e:
        raise ExtractorError('A network error has occurred.', cause=e, expected=True)
    except (KeyError, StopIteration) as e:
        raise ExtractorError('An extractor error has occurred.', cause=e)
d6983cb4
PH
319
320 def set_downloader(self, downloader):
321 """Sets the downloader for this IE."""
322 self._downloader = downloader
323
324 def _real_initialize(self):
325 """Real initialization process. Redefine in subclasses."""
326 pass
327
328 def _real_extract(self, url):
329 """Real extraction process. Redefine in subclasses."""
330 pass
331
56c73665
JMF
@classmethod
def ie_key(cls):
    """Return the class name minus its last two characters, as the key used with get_info_extractor."""
    class_name = cls.__name__
    return compat_str(class_name[:-2])
56c73665 336
d6983cb4
PH
@property
def IE_NAME(self):
    """Name of this IE: the instance's class name minus its last two characters."""
    class_name = type(self).__name__
    return compat_str(class_name[:-2])
d6983cb4 340
7cc3570e 341 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4
PH
342 """ Returns the response handle """
343 if note is None:
344 self.report_download_webpage(video_id)
345 elif note is not False:
7cc3570e 346 if video_id is None:
f1a9d64e 347 self.to_screen('%s' % (note,))
7cc3570e 348 else:
f1a9d64e 349 self.to_screen('%s: %s' % (video_id, note))
d6983cb4 350 try:
dca08720 351 return self._downloader.urlopen(url_or_request)
d6983cb4 352 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
aa94a6d3
PH
353 if errnote is False:
354 return False
d6983cb4 355 if errnote is None:
f1a9d64e 356 errnote = 'Unable to download webpage'
7f8b2714 357
9b9c5355 358 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
7cc3570e
PH
359 if fatal:
360 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
361 else:
362 self._downloader.report_warning(errmsg)
363 return False
d6983cb4 364
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
    """Return a tuple (page content as string, URL handle), or False if the request failed non-fatally."""
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request, _, _ = url_or_request.partition('#')

    handle = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if handle is False:
        # _request_webpage only returns False when the failure is not fatal
        assert not fatal
        return False
    page = self._webpage_read_content(
        handle, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
    return (page, handle)
377
c9a77969
YCH
378 @staticmethod
379 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
380 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
381 if m:
382 encoding = m.group(1)
383 else:
0d75ae2c 384 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
385 webpage_bytes[:1024])
386 if m:
387 encoding = m.group(1).decode('ascii')
b60016e8
PH
388 elif webpage_bytes.startswith(b'\xff\xfe'):
389 encoding = 'utf-16'
f143d86a
PH
390 else:
391 encoding = 'utf-8'
c9a77969
YCH
392
393 return encoding
394
395 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
396 content_type = urlh.headers.get('Content-Type', '')
397 webpage_bytes = urlh.read()
398 if prefix is not None:
399 webpage_bytes = prefix + webpage_bytes
400 if not encoding:
401 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
d6983cb4
PH
402 if self._downloader.params.get('dump_intermediate_pages', False):
403 try:
404 url = url_or_request.get_full_url()
405 except AttributeError:
406 url = url_or_request
f1a9d64e 407 self.to_screen('Dumping request to ' + url)
d6983cb4
PH
408 dump = base64.b64encode(webpage_bytes).decode('ascii')
409 self._downloader.to_screen(dump)
d41e6efc
PH
410 if self._downloader.params.get('write_pages', False):
411 try:
412 url = url_or_request.get_full_url()
413 except AttributeError:
414 url = url_or_request
5afa7f8b 415 basen = '%s_%s' % (video_id, url)
c1bce22f 416 if len(basen) > 240:
f1a9d64e 417 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
c1bce22f
PH
418 basen = basen[:240 - len(h)] + h
419 raw_filename = basen + '.dump'
d41e6efc 420 filename = sanitize_filename(raw_filename, restricted=True)
f1a9d64e 421 self.to_screen('Saving request to ' + filename)
5f58165d
S
422 # Working around MAX_PATH limitation on Windows (see
423 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
424 if os.name == 'nt':
425 absfilepath = os.path.abspath(filename)
426 if len(absfilepath) > 259:
427 filename = '\\\\?\\' + absfilepath
d41e6efc
PH
428 with open(filename, 'wb') as outf:
429 outf.write(webpage_bytes)
430
ec0fafbb
AA
431 try:
432 content = webpage_bytes.decode(encoding, 'replace')
433 except LookupError:
434 content = webpage_bytes.decode('utf-8', 'replace')
2410c43d 435
f1a9d64e
PH
436 if ('<title>Access to this site is blocked</title>' in content and
437 'Websense' in content[:512]):
438 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
2410c43d
PH
439 blocked_iframe = self._html_search_regex(
440 r'<iframe src="([^"]+)"', content,
f1a9d64e 441 'Websense information URL', default=None)
2410c43d 442 if blocked_iframe:
f1a9d64e 443 msg += ' Visit %s for more details' % blocked_iframe
2410c43d 444 raise ExtractorError(msg, expected=True)
77b2986b
PH
445 if '<title>The URL you requested has been blocked</title>' in content[:512]:
446 msg = (
447 'Access to this webpage has been blocked by Indian censorship. '
448 'Use a VPN or proxy server (with --proxy) to route around it.')
449 block_msg = self._html_search_regex(
450 r'</h1><p>(.*?)</p>',
451 content, 'block message', default=None)
452 if block_msg:
453 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
454 raise ExtractorError(msg, expected=True)
2410c43d 455
23be51d8 456 return content
d6983cb4 457
c9a77969 458 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
d6983cb4 459 """ Returns the data of the page as a string """
995ad69c
TF
460 success = False
461 try_count = 0
462 while success is False:
463 try:
c9a77969 464 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
995ad69c
TF
465 success = True
466 except compat_http_client.IncompleteRead as e:
467 try_count += 1
468 if try_count >= tries:
469 raise e
470 self._sleep(timeout, video_id)
7cc3570e
PH
471 if res is False:
472 return res
473 else:
474 content, _ = res
475 return content
d6983cb4 476
def _download_xml(self, url_or_request, video_id,
                  note='Downloading XML', errnote='Unable to download XML',
                  transform_source=None, fatal=True, encoding=None):
    """Download a page and return it parsed as an xml.etree.ElementTree.Element.

    Returns False when the download failed non-fatally. transform_source,
    if given, is applied to the downloaded string before parsing.
    """
    document = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
    if document is False:
        return document
    if transform_source:
        document = transform_source(document)
    return compat_etree_fromstring(document.encode('utf-8'))
267ed0c5 488
3d3538e4 489 def _download_json(self, url_or_request, video_id,
f1a9d64e
PH
490 note='Downloading JSON metadata',
491 errnote='Unable to download JSON metadata',
b090af59 492 transform_source=None,
c9a77969 493 fatal=True, encoding=None):
b090af59 494 json_string = self._download_webpage(
c9a77969
YCH
495 url_or_request, video_id, note, errnote, fatal=fatal,
496 encoding=encoding)
b090af59
PH
497 if (not fatal) and json_string is False:
498 return None
ebb64199
TF
499 return self._parse_json(
500 json_string, video_id, transform_source=transform_source, fatal=fatal)
501
502 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
81c2f20b
PH
503 if transform_source:
504 json_string = transform_source(json_string)
3d3538e4
PH
505 try:
506 return json.loads(json_string)
507 except ValueError as ve:
e7b6d122
PH
508 errmsg = '%s: Failed to parse JSON ' % video_id
509 if fatal:
510 raise ExtractorError(errmsg, cause=ve)
511 else:
512 self.report_warning(errmsg + str(ve))
3d3538e4 513
f45f96f8 514 def report_warning(self, msg, video_id=None):
f1a9d64e 515 idstr = '' if video_id is None else '%s: ' % video_id
f45f96f8 516 self._downloader.report_warning(
f1a9d64e 517 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
f45f96f8 518
d6983cb4
PH
519 def to_screen(self, msg):
520 """Print msg to screen, prefixing it with '[ie_name]'"""
f1a9d64e 521 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
d6983cb4
PH
522
523 def report_extraction(self, id_or_name):
524 """Report information extraction."""
f1a9d64e 525 self.to_screen('%s: Extracting information' % id_or_name)
d6983cb4
PH
526
527 def report_download_webpage(self, video_id):
528 """Report webpage download."""
f1a9d64e 529 self.to_screen('%s: Downloading webpage' % video_id)
d6983cb4
PH
530
531 def report_age_confirmation(self):
532 """Report attempt to confirm age."""
f1a9d64e 533 self.to_screen('Confirming age')
d6983cb4 534
fc79158d
JMF
535 def report_login(self):
536 """Report attempt to log in."""
f1a9d64e 537 self.to_screen('Logging in')
fc79158d 538
43e7d3c9
S
@staticmethod
def raise_login_required(msg='This video is only available for registered users'):
    """Raise an expected ExtractorError made of msg plus a hint about the credential options."""
    hint = '%s. Use --username and --password or --netrc to provide account credentials.' % msg
    raise ExtractorError(hint, expected=True)
544
c430802e
S
@staticmethod
def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
    """Raise an expected ExtractorError made of msg plus a hint about --proxy."""
    hint = '%s. You might want to use --proxy to workaround.' % msg
    raise ExtractorError(hint, expected=True)
550
5f6a1245 551 # Methods for following #608
c0d0b01f 552 @staticmethod
830d53bf 553 def url_result(url, ie=None, video_id=None, video_title=None):
10952eb2 554 """Returns a URL that points to a page that should be processed"""
5f6a1245 555 # TODO: ie should be the class used for getting the info
d6983cb4
PH
556 video_info = {'_type': 'url',
557 'url': url,
558 'ie_key': ie}
7012b23c
PH
559 if video_id is not None:
560 video_info['id'] = video_id
830d53bf
S
561 if video_title is not None:
562 video_info['title'] = video_title
d6983cb4 563 return video_info
5f6a1245 564
c0d0b01f 565 @staticmethod
acf5cbfe 566 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
d6983cb4
PH
567 """Returns a playlist"""
568 video_info = {'_type': 'playlist',
569 'entries': entries}
570 if playlist_id:
571 video_info['id'] = playlist_id
572 if playlist_title:
573 video_info['title'] = playlist_title
acf5cbfe
S
574 if playlist_description:
575 video_info['description'] = playlist_description
d6983cb4
PH
576 return video_info
577
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    pattern: a pattern string, a compiled regex, or an iterable of those
             (tried in order; the first one that matches wins).
    group:   group to return; None returns the first group that is not None.
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        # Start from "no match" so that an empty pattern list is handled
        # like any other failure instead of leaving mobj unbound.
        mobj = None
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Highlight the field name in messages when writing to a colour terminal
    if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        else:
            return mobj.group(group)
    elif default is not NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
        return None
611
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but the result is passed through clean_html and
    stripped. Falsy results (None, empty string, a falsy default) are
    returned untouched.
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags, group)
    return clean_html(found).strip() if found else found
621
fc79158d
JMF
622 def _get_login_info(self):
623 """
cf0649f8 624 Get the login info as (username, password)
fc79158d
JMF
625 It will look in the netrc file using the _NETRC_MACHINE value
626 If there's no info available, return (None, None)
627 """
628 if self._downloader is None:
629 return (None, None)
630
631 username = None
632 password = None
633 downloader_params = self._downloader.params
634
635 # Attempt to use provided username and password or .netrc data
636 if downloader_params.get('username', None) is not None:
637 username = downloader_params['username']
638 password = downloader_params['password']
639 elif downloader_params.get('usenetrc', False):
640 try:
641 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
642 if info is not None:
643 username = info[0]
644 password = info[2]
645 else:
646 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
647 except (IOError, netrc.NetrcParseError) as err:
9b9c5355 648 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
5f6a1245 649
fc79158d
JMF
650 return (username, password)
651
e64b7569 652 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 653 """
654 Get the two-factor authentication info
655 TODO - asking the user will be required for sms/phone verify
656 currently just uses the command line option
657 If there's no info available, return None
658 """
659 if self._downloader is None:
83317f69 660 return None
661 downloader_params = self._downloader.params
662
663 if downloader_params.get('twofactor', None) is not None:
664 return downloader_params['twofactor']
665
e64b7569 666 return compat_getpass('Type %s and press [Return]: ' % note)
83317f69 667
46720279
JMF
668 # Helper functions for extracting OpenGraph info
669 @staticmethod
ab2d5247 670 def _og_regexes(prop):
448ef1f3 671 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
7a6d76a6
S
672 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
673 % {'prop': re.escape(prop)})
78fb87b2 674 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 675 return [
78fb87b2
JMF
676 template % (property_re, content_re),
677 template % (content_re, property_re),
ab2d5247 678 ]
46720279 679
864f24bd
S
680 @staticmethod
681 def _meta_regex(prop):
682 return r'''(?isx)<meta
8b9848ac 683 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
864f24bd
S
684 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
685
3c4e6d83 686 def _og_search_property(self, prop, html, name=None, **kargs):
46720279 687 if name is None:
3c4e6d83 688 name = 'OpenGraph %s' % prop
ab2d5247 689 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
690 if escaped is None:
691 return None
692 return unescapeHTML(escaped)
46720279
JMF
693
694 def _og_search_thumbnail(self, html, **kargs):
10952eb2 695 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
696
697 def _og_search_description(self, html, **kargs):
698 return self._og_search_property('description', html, fatal=False, **kargs)
699
700 def _og_search_title(self, html, **kargs):
701 return self._og_search_property('title', html, **kargs)
702
8ffa13e0 703 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
704 regexes = self._og_regexes('video') + self._og_regexes('video:url')
705 if secure:
706 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 707 return self._html_search_regex(regexes, html, name, **kargs)
46720279 708
78338f71
JMF
709 def _og_search_url(self, html, **kargs):
710 return self._og_search_property('url', html, **kargs)
711
40c696e5 712 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
59040888
PH
713 if display_name is None:
714 display_name = name
715 return self._html_search_regex(
864f24bd 716 self._meta_regex(name),
711ede6e 717 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
718
719 def _dc_search_uploader(self, html):
720 return self._html_search_meta('dc.creator', html, 'uploader')
721
8dbe9899
PH
722 def _rta_search(self, html):
723 # See http://www.rtalabel.org/index.php?content=howtofaq#single
724 if re.search(r'(?ix)<meta\s+name="rating"\s+'
725 r' content="RTA-5042-1996-1400-1577-RTA"',
726 html):
727 return 18
728 return 0
729
59040888
PH
730 def _media_rating_search(self, html):
731 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
732 rating = self._html_search_meta('rating', html)
733
734 if not rating:
735 return None
736
737 RATING_TABLE = {
738 'safe for kids': 0,
739 'general': 8,
740 '14 years': 14,
741 'mature': 17,
742 'restricted': 19,
743 }
744 return RATING_TABLE.get(rating.lower(), None)
745
69319969 746 def _family_friendly_search(self, html):
6ca7732d 747 # See http://schema.org/VideoObject
69319969
NJ
748 family_friendly = self._html_search_meta('isFamilyFriendly', html)
749
750 if not family_friendly:
751 return None
752
753 RATING_TABLE = {
754 '1': 0,
755 'true': 0,
756 '0': 18,
757 'false': 18,
758 }
759 return RATING_TABLE.get(family_friendly.lower(), None)
760
0c708f11
JMF
761 def _twitter_search_player(self, html):
762 return self._html_search_meta('twitter:player', html,
9e1a5b84 763 'twitter card player')
0c708f11 764
27713812 765 @staticmethod
f8da79f8 766 def _hidden_inputs(html):
586f1cc5 767 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
201ea3ee 768 hidden_inputs = {}
73eb13df 769 for input in re.findall(r'(?i)<input([^>]+)>', html):
be0e5dbd 770 if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
201ea3ee
S
771 continue
772 name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
773 if not name:
774 continue
775 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
776 if not value:
777 continue
778 hidden_inputs[name.group('value')] = value.group('value')
779 return hidden_inputs
27713812 780
cf61d96d
S
781 def _form_hidden_inputs(self, form_id, html):
782 form = self._search_regex(
73eb13df 783 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
cf61d96d
S
784 html, '%s form' % form_id, group='form')
785 return self._hidden_inputs(form)
786
def _sort_formats(self, formats, field_preference=None):
    """Sort the list of format dicts in place, in ascending key order.

    Raises ExtractorError when formats is empty. When field_preference
    is a list or tuple, the key is just those fields (missing or None
    counting as -1). Otherwise the key is the fixed tuple built at the
    end of _formats_key. As a side effect, a format without 'ext' but
    with 'url' gets its 'ext' filled in via determine_ext.
    """
    if not formats:
        raise ExtractorError('No video formats found')

    def _value(f, field, fallback=-1):
        # Absent and explicit-None values are treated alike
        value = f.get(field)
        return fallback if value is None else value

    def _formats_key(f):
        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        if isinstance(field_preference, (list, tuple)):
            return tuple(_value(f, field) for field in field_preference)

        preference = _value(f, 'preference', 0)
        if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
            preference -= 0.5

        if determine_protocol(f) in ['http', 'https']:
            proto_preference = 0
        else:
            proto_preference = -0.1

        def _ext_rank(order):
            # Position of the format's extension in order, -1 if not listed
            try:
                return order.index(f['ext'])
            except ValueError:
                return -1

        prefer_free = self._downloader.params.get('prefer_free_formats')
        if f.get('vcodec') == 'none':  # audio only
            if prefer_free:
                ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
            else:
                ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
            ext_preference = 0
            audio_ext_preference = _ext_rank(ORDER)
        else:
            if prefer_free:
                ORDER = ['flv', 'mp4', 'webm']
            else:
                ORDER = ['webm', 'flv', 'mp4']
            ext_preference = _ext_rank(ORDER)
            audio_ext_preference = 0

        return (
            preference,
            _value(f, 'language_preference'),
            _value(f, 'quality'),
            _value(f, 'tbr'),
            _value(f, 'filesize'),
            _value(f, 'vbr'),
            _value(f, 'height'),
            _value(f, 'width'),
            proto_preference,
            ext_preference,
            _value(f, 'abr'),
            audio_ext_preference,
            _value(f, 'fps'),
            _value(f, 'filesize_approx'),
            _value(f, 'source_preference'),
            _value(f, 'format_id', ''),
        )
    formats.sort(key=_formats_key)
59040888 848
96a53167
S
849 def _check_formats(self, formats, video_id):
850 if formats:
851 formats[:] = filter(
852 lambda f: self._is_valid_url(
853 f['url'], video_id,
854 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
855 formats)
856
857 def _is_valid_url(self, url, video_id, item='video'):
2f0f6578
S
858 url = self._proto_relative_url(url, scheme='http:')
859 # For now assume non HTTP(S) URLs always valid
860 if not (url.startswith('http://') or url.startswith('https://')):
861 return True
96a53167 862 try:
4069766c 863 self._request_webpage(url, video_id, 'Checking %s URL' % item)
96a53167
S
864 return True
865 except ExtractorError as e:
943a1e24 866 if isinstance(e.cause, compat_urllib_error.URLError):
baa43cba
S
867 self.to_screen(
868 '%s: %s URL is invalid, skipping' % (video_id, item))
96a53167
S
869 return False
870 raise
871
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    # 'prefer_insecure' in the downloader params selects plain http
    if self._downloader.params.get('prefer_insecure', False):
        return 'http:'
    return 'https:'
878
57c7411f
PH
879 def _proto_relative_url(self, url, scheme=None):
880 if url is None:
881 return url
882 if url.startswith('//'):
883 if scheme is None:
884 scheme = self.http_scheme()
885 return scheme + url
886 else:
887 return url
888
4094b6e3
PH
889 def _sleep(self, timeout, video_id, msg_template=None):
890 if msg_template is None:
f1a9d64e 891 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
4094b6e3
PH
892 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
893 self.to_screen(msg)
894 time.sleep(timeout)
895
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                         transform_source=lambda s: fix_xml_ampersands(s).strip(),
                         fatal=True):
    """Download an f4m manifest and return a sorted list of format dicts.

    Returns [] when the download fails non-fatally (the download helper
    returned False). Media nodes are looked up in the f4m 1.0 namespace
    first and in the 2.0 namespace when none are found. For 2.0
    manifests each media node's href/url is resolved against the
    manifest's baseURL, or against the manifest's own location when
    there is no baseURL; a media URL that is itself an f4m manifest is
    extracted recursively.
    """
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest',
        # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
        # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
        transform_source=transform_source,
        fatal=fatal)

    if manifest is False:
        return []

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    base_url = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
        'base URL', default=None)
    if base_url:
        base_url = base_url.strip()
    # Computed once from the manifest's own URL: every relative media
    # reference must be resolved against the manifest, never against the
    # URL of a previously processed media entry.
    media_base_url = base_url or '/'.join(manifest_url.split('/')[:-1])
    for i, media_el in enumerate(media_nodes):
        # 1.0 manifests are addressed through the manifest itself
        format_url = manifest_url
        if manifest_version == '2.0':
            media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
            if not media_url:
                continue
            if media_url.startswith('http://') or media_url.startswith('https://'):
                format_url = media_url
            else:
                format_url = media_base_url + '/' + media_url
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            if determine_ext(format_url) == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    format_url, video_id, preference, f4m_id, fatal=fatal))
                continue
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        formats.append({
            # Fall back to the node index when no bitrate is declared
            'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
            'url': format_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(media_el.attrib.get('width')),
            'height': int_or_none(media_el.attrib.get('height')),
            'preference': preference,
        })
    self._sort_formats(formats)

    return formats
950
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None,
                          m3u8_id=None, note=None, errnote=None,
                          fatal=True):
    """Download an m3u8 playlist and return a sorted list of format dicts.

    The first entry is always a 'meta' format pointing at the playlist
    itself. Returns [] when the download fails non-fatally (the
    download helper returned False). URI lines are resolved against the
    final (post-redirect) playlist URL. A URI line seen before any
    #EXT-X-STREAM-INF tag yields a format carrying only 'url'.
    """
    formats = [{
        'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': preference - 1 if preference else -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }]

    # NOTE: reads m3u8_url at call time, i.e. after it has been replaced
    # by the redirected URL below
    format_url = lambda u: (
        u
        if re.match(r'^https?://', u)
        else compat_urlparse.urljoin(m3u8_url, u))

    res = self._download_webpage_handle(
        m3u8_url, video_id,
        note=note or 'Downloading m3u8 information',
        errnote=errnote or 'Failed to download m3u8 information',
        fatal=fatal)
    if res is False:
        return []
    m3u8_doc, urlh = res
    m3u8_url = urlh.geturl()

    kv_rex = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')

    def parse_attributes(tag_line):
        # Collect the KEY=VALUE pairs of a tag line, unquoting quoted values
        attributes = {}
        for m in kv_rex.finditer(tag_line):
            v = m.group('val')
            if v.startswith('"'):
                v = v[1:-1]
            attributes[m.group('key')] = v
        return attributes

    last_info = None
    last_media = None
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_info = parse_attributes(line)
        elif line.startswith('#EXT-X-MEDIA:'):
            last_media = parse_attributes(line)
        elif line.startswith('#') or not line.strip():
            continue
        else:
            # Strip once so bare and tagged entries are resolved alike
            uri = line.strip()
            if last_info is None:
                formats.append({'url': format_url(uri)})
                continue
            tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
            format_id = []
            if m3u8_id:
                format_id.append(m3u8_id)
            last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
            format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
            f = {
                'format_id': '-'.join(format_id),
                'url': format_url(uri),
                'tbr': tbr,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }
            codecs = last_info.get('CODECS')
            if codecs:
                # TODO: looks like video codec is not always necessarily goes first
                va_codecs = codecs.split(',')
                if va_codecs[0]:
                    f['vcodec'] = va_codecs[0].partition('.')[0]
                if len(va_codecs) > 1 and va_codecs[1]:
                    f['acodec'] = va_codecs[1].partition('.')[0]
            resolution = last_info.get('RESOLUTION')
            if resolution:
                width_str, height_str = resolution.split('x')
                f['width'] = int(width_str)
                f['height'] = int(height_str)
            if last_media is not None:
                f['m3u8_media'] = last_media
                last_media = None
            formats.append(f)
            last_info = {}
    self._sort_formats(formats)
    return formats
1039
a107193e
S
1040 @staticmethod
1041 def _xpath_ns(path, namespace=None):
1042 if not namespace:
1043 return path
1044 out = []
1045 for c in path.split('/'):
1046 if not c or c == '.':
1047 out.append(c)
1048 else:
1049 out.append('{%s}%s' % (namespace, c))
1050 return '/'.join(out)
1051
1052 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
1053 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1054
995029a1
PH
1055 if smil is False:
1056 assert not fatal
1057 return []
e89a2aab 1058
17712eeb 1059 namespace = self._parse_smil_namespace(smil)
a107193e
S
1060
1061 return self._parse_smil_formats(
1062 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1063
1064 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1065 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1066 if smil is False:
1067 return {}
1068 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1069
1070 def _download_smil(self, smil_url, video_id, fatal=True):
1071 return self._download_xml(
1072 smil_url, video_id, 'Downloading SMIL file',
1073 'Unable to download SMIL file', fatal=fatal)
1074
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """Build an info dict from a parsed SMIL document.

    Formats and subtitles come from _parse_smil_formats and
    _parse_smil_subtitles. The returned 'id' is derived from the
    basename of smil_url (extension removed), not from the video_id
    argument, which is only used while extracting formats. Title,
    description and upload date are taken from the first matching
    ./head/meta elements; the title falls back to the id.
    """
    namespace = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

    video_id = os.path.splitext(url_basename(smil_url))[0]

    title = description = upload_date = None
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        name = meta.attrib.get('name')
        content = meta.attrib.get('content')
        if not (name and content):
            continue
        # Only the first non-empty occurrence of each field is kept
        if not title and name == 'title':
            title = content
        elif not description and name in ('description', 'abstract'):
            description = content
        elif not upload_date and name == 'date':
            upload_date = unified_strdate(content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', namespace)):
        if not image.get('src'):
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
1114
17712eeb
S
1115 def _parse_smil_namespace(self, smil):
1116 return self._search_regex(
1117 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1118
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """Extract a sorted list of format dicts from a parsed SMIL document.

    Relative sources are resolved against the first 'base'/'httpBase'
    attribute found on ./head/meta, or against smil_url when there is
    none. Each <video> with a src is handled as exactly one of: RTMP
    stream, m3u8 playlist, f4m manifest or plain HTTP(S) file; anything
    else is ignored.

    f4m_params are appended as a query string to f4m manifest URLs
    (defaults are used when not given). transform_rtmp_url, when given,
    is called as transform_rtmp_url(streamer, src) and must return the
    (url, play_path) pair to use for an RTMP format.
    """
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        b = meta.get('base') or meta.get('httpBase')
        if b:
            base = b
            break

    formats = []
    rtmp_count = 0
    http_count = 0

    for video in smil.findall(self._xpath_ns('.//video', namespace)):
        src = video.get('src')
        if not src:
            continue

        bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
        filesize = int_or_none(video.get('size') or video.get('fileSize'))
        width = int_or_none(video.get('width'))
        height = int_or_none(video.get('height'))
        proto = video.get('proto')
        ext = video.get('ext')
        src_ext = determine_ext(src)
        streamer = video.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            rtmp_url, play_path = streamer, src
            if transform_rtmp_url:
                rtmp_url, play_path = transform_rtmp_url(streamer, src)
            formats.append({
                'url': rtmp_url,
                'play_path': play_path,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            continue

        src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)

        if proto == 'm3u8' or src_ext == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False))
            continue

        if src_ext == 'f4m':
            f4m_url = src_url
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += compat_urllib_parse.urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            continue

        # Validate the resolved URL that is actually stored in the format;
        # the raw src may be relative and would never really be checked.
        if src_url.startswith('http') and self._is_valid_url(src_url, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })

    self._sort_formats(formats)

    return formats
1201
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    """Collect the <textstream> elements of a SMIL document as subtitles.

    Returns a dict mapping language to a list of {'url', 'ext'} dicts.
    The extension comes from the 'ext' attribute or from the src; when
    neither gives one, the 'type' attribute is looked up in a small
    MIME-type table. The language comes from the systemLanguage,
    systemLanguageName or lang attribute, else subtitles_lang.
    """
    SUBTITLES_TYPES = {
        'text/vtt': 'vtt',
        'text/srt': 'srt',
        'application/smptett+xml': 'tt',
    }
    subtitles = {}
    for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)):
        src = textstream.get('src')
        if not src:
            continue
        ext = textstream.get('ext') or determine_ext(src)
        if not ext:
            # Unknown types leave ext as it was
            ext = SUBTITLES_TYPES.get(textstream.get('type'), ext)
        lang = (
            textstream.get('systemLanguage') or
            textstream.get('systemLanguageName') or
            textstream.get('lang') or
            subtitles_lang)
        subtitles.setdefault(lang, []).append({
            'url': src,
            'ext': ext,
        })
    return subtitles
63757032 1224
942acef5
S
1225 def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1226 xspf = self._download_xml(
8d6765cf 1227 playlist_url, playlist_id, 'Downloading xpsf playlist',
942acef5
S
1228 'Unable to download xspf manifest', fatal=fatal)
1229 if xspf is False:
1230 return []
1231 return self._parse_xspf(xspf, playlist_id)
8d6765cf 1232
def _parse_xspf(self, playlist, playlist_id):
    """Turn a parsed XSPF playlist into a list of entry dicts.

    One entry is produced per xspf:track; every entry uses playlist_id
    as its 'id', and the title falls back to playlist_id as well. Each
    xspf:location of a track becomes one format, and the formats of
    each track are sorted with _sort_formats.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def ns(path):
        # Expand the namespace prefixes used in path
        return xpath_with_ns(path, NS_MAP)

    entries = []
    for track in playlist.findall(ns('./xspf:trackList/xspf:track')):
        title = xpath_text(track, ns('./xspf:title'), 'title', default=playlist_id)
        description = xpath_text(track, ns('./xspf:annotation'), 'description')
        thumbnail = xpath_text(track, ns('./xspf:image'), 'thumbnail')
        duration = float_or_none(
            xpath_text(track, ns('./xspf:duration'), 'duration'), 1000)

        formats = []
        for location in track.findall(ns('./xspf:location')):
            formats.append({
                'url': location.text,
                'format_id': location.get(ns('s1:label')),
                'width': int_or_none(location.get(ns('s1:width'))),
                'height': int_or_none(location.get(ns('s1:height'))),
            })
        self._sort_formats(formats)

        entries.append({
            'id': playlist_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        })
    return entries
1267
f4b1c7ad
PH
1268 def _live_title(self, name):
1269 """ Generate the title for a live video """
1270 now = datetime.datetime.now()
1271 now_str = now.strftime("%Y-%m-%d %H:%M")
1272 return name + ' ' + now_str
1273
b14f3a4c
PH
def _int(self, v, name, fatal=False, **kwargs):
    """Convert v with int_or_none, reporting failures.

    name is only used in the failure message. When the conversion
    yields None, an ExtractorError is raised if fatal is true;
    otherwise a warning is reported through the downloader and None is
    returned. Extra keyword arguments are passed to int_or_none.
    """
    res = int_or_none(v, **kwargs)
    # A leftover debug print of getattr(v, kwargs['get_attr']) used to sit
    # here; it wrote to stdout and could raise AttributeError on its own.
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
1285
def _float(self, v, name, fatal=False, **kwargs):
    """Convert v with float_or_none, reporting failures.

    name is only used in the failure message. When the conversion
    yields None, an ExtractorError is raised if fatal is true;
    otherwise a warning is reported through the downloader and None is
    returned. Extra keyword arguments are passed to float_or_none.
    """
    res = float_or_none(v, **kwargs)
    if res is not None:
        return res
    msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(msg)
    self._downloader.report_warning(msg)
    return res
1295
def _set_cookie(self, domain, name, value, expire_time=None):
    """Store a cookie for domain in the downloader's cookie jar.

    The cookie is created with path '/', not marked secure, and expires
    at expire_time (None by default).
    """
    cookie = compat_cookiejar.Cookie(
        version=0, name=name, value=value,
        port=None, port_specified=None,
        domain=domain, domain_specified=None, domain_initial_dot=None,
        path='/', path_specified=True,
        secure=False, expires=expire_time, discard='',
        comment=None, comment_url=None, rest=None)
    self._downloader.cookiejar.set_cookie(cookie)
1301
def _get_cookies(self, url):
    """ Return a compat_cookies.SimpleCookie with the cookies for the url """
    # Let the cookie jar fill in the Cookie header of a throwaway request
    # for url, then parse that header back into a SimpleCookie
    request = sanitized_Request(url)
    self._downloader.cookiejar.add_cookie_header(request)
    cookie_header = request.get_header('Cookie')
    return compat_cookies.SimpleCookie(cookie_header)
1307
05900629
PH
def get_testcases(self, include_onlymatching=False):
    """Yield the test case dicts declared on this extractor.

    Test cases come from _TEST (a single dict) or _TESTS (a list);
    declaring both is an assertion error. Entries flagged
    'only_matching' are skipped unless include_onlymatching is true.
    Each yielded dict gets a 'name' key set to the class name with its
    last two characters (the 'IE' suffix) removed.
    """
    single = getattr(self, '_TEST', None)
    if single:
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % type(self).__name__
        tests = [single]
    else:
        tests = getattr(self, '_TESTS', [])
    for testcase in tests:
        if not include_onlymatching and testcase.get('only_matching', False):
            continue
        testcase['name'] = type(self).__name__[:-len('IE')]
        yield testcase
1321
def is_suitable(self, age_limit):
    """ Test whether the extractor is generally suitable for the given
    age limit (i.e. pornographic sites are not, all others usually are) """

    # Suitable as soon as one test case is not age restricted; an
    # extractor without any test cases counts as suitable too.
    saw_testcase = False
    for tc in self.get_testcases(include_onlymatching=False):
        saw_testcase = True
        if 'playlist' in tc:
            # Playlist tests are judged by their first entry
            tc = tc['playlist'][0]
        content_limit = tc.get('info_dict', {}).get('age_limit')
        if not age_restricted(content_limit, age_limit):
            return True
    return not saw_testcase
1336
def extract_subtitles(self, *args, **kwargs):
    """Return subtitles via _get_subtitles, but only when requested.

    _get_subtitles is called (with all arguments passed through) when
    the 'writesubtitles' or 'listsubtitles' downloader option is set;
    otherwise an empty dict is returned.
    """
    params = self._downloader.params
    wanted = params.get('writesubtitles', False) or params.get('listsubtitles')
    if not wanted:
        return {}
    return self._get_subtitles(*args, **kwargs)
a504ced0
JMF
1342
1343 def _get_subtitles(self, *args, **kwargs):
1344 raise NotImplementedError("This method must be implemented by subclasses")
1345
912e0b7e
YCH
1346 @staticmethod
1347 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1348 """ Merge subtitle items for one language. Items with duplicated URLs
1349 will be dropped. """
1350 list1_urls = set([item['url'] for item in subtitle_list1])
1351 ret = list(subtitle_list1)
1352 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1353 return ret
1354
@classmethod
def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
    """ Merge two subtitle dictionaries, language by language. """
    # Start from a shallow copy of the first dict, then merge in every
    # language of the second one item list at a time
    merged = dict(subtitle_dict1)
    for lang, items in subtitle_dict2.items():
        merged[lang] = cls._merge_subtitle_items(
            subtitle_dict1.get(lang, []), items)
    return merged
1362
def extract_automatic_captions(self, *args, **kwargs):
    """Return captions via _get_automatic_captions, but only when requested.

    _get_automatic_captions is called (with all arguments passed
    through) when the 'writeautomaticsub' or 'listsubtitles' downloader
    option is set; otherwise an empty dict is returned.
    """
    params = self._downloader.params
    wanted = params.get('writeautomaticsub', False) or params.get('listsubtitles')
    if not wanted:
        return {}
    return self._get_automatic_captions(*args, **kwargs)
360e1ca5
JMF
1368
1369 def _get_automatic_captions(self, *args, **kwargs):
1370 raise NotImplementedError("This method must be implemented by subclasses")
1371
8dbe9899 1372
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching this extractor's search queries."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        An empty prefix asks for one result, 'all' for _MAX_RESULTS, and
        a number for that many results (capped at _MAX_RESULTS with a
        warning).
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class's _SEARCH_KEY."""
        return self._SEARCH_KEY