]> jfr.im git - yt-dlp.git/blame - youtube_dl/extractor/common.py
[extractor/common] Interactive TFA code input
[yt-dlp.git] / youtube_dl / extractor / common.py
CommitLineData
f1a9d64e
PH
1from __future__ import unicode_literals
2
d6983cb4 3import base64
f4b1c7ad 4import datetime
3ec05685 5import hashlib
3d3538e4 6import json
4094b6e3 7import netrc
d6983cb4
PH
8import os
9import re
10import socket
11import sys
4094b6e3 12import time
267ed0c5 13import xml.etree.ElementTree
d6983cb4 14
8c25f81b 15from ..compat import (
42939b61 16 compat_cookiejar,
799207e8 17 compat_cookies,
e64b7569 18 compat_getpass,
96a53167 19 compat_HTTPError,
d6983cb4
PH
20 compat_http_client,
21 compat_urllib_error,
a107193e 22 compat_urllib_parse,
c7deaa4c 23 compat_urllib_parse_urlparse,
799207e8 24 compat_urllib_request,
f0b5d6af 25 compat_urlparse,
d6983cb4 26 compat_str,
8c25f81b
PH
27)
28from ..utils import (
c342041f 29 NO_DEFAULT,
05900629 30 age_restricted,
08f2a92c 31 bug_reports_message,
d6983cb4
PH
32 clean_html,
33 compiled_regex_type,
70f0f5a8 34 determine_ext,
d6983cb4 35 ExtractorError,
97f4aecf 36 fix_xml_ampersands,
b14f3a4c 37 float_or_none,
31bb8d3f 38 int_or_none,
55b3e45b 39 RegexNotFoundError,
d41e6efc 40 sanitize_filename,
f38de77f 41 unescapeHTML,
a107193e 42 url_basename,
8d6765cf
S
43 xpath_text,
44 xpath_with_ns,
d6983cb4 45)
c342041f 46
d6983cb4
PH
47
48class InfoExtractor(object):
49 """Information Extractor class.
50
51 Information extractors are the classes that, given a URL, extract
52 information about the video (or videos) the URL refers to. This
53 information includes the real video URL, the video title, author and
54 others. The information is stored in a dictionary which is then
5d380852 55 passed to the YoutubeDL. The YoutubeDL processes this
d6983cb4
PH
56 information possibly downloading the video to the file system, among
57 other possible outcomes.
58
cf0649f8 59 The type field determines the type of the result.
fed5d032
PH
60 By far the most common value (and the default if _type is missing) is
61 "video", which indicates a single video.
62
63 For a video, the dictionaries must include the following fields:
d6983cb4
PH
64
65 id: Video identifier.
d6983cb4 66 title: Video title, unescaped.
d67b0b15 67
f49d89ee 68 Additionally, it must contain either a formats entry or a url one:
d67b0b15 69
f49d89ee
PH
70 formats: A list of dictionaries for each format available, ordered
71 from worst to best quality.
72
73 Potential fields:
d67b0b15 74 * url Mandatory. The URL of the video file
10952eb2 75 * ext Will be calculated from URL if missing
d67b0b15
PH
76 * format A human-readable description of the format
77 ("mp4 container with h264/opus").
78 Calculated from the format_id, width, height.
79 and format_note fields if missing.
80 * format_id A short description of the format
5d4f3985
PH
81 ("mp4_h264_opus" or "19").
82 Technically optional, but strongly recommended.
d67b0b15
PH
83 * format_note Additional info about the format
84 ("3D" or "DASH video")
85 * width Width of the video, if known
86 * height Height of the video, if known
f49d89ee 87 * resolution Textual description of width and height
7217e148 88 * tbr Average bitrate of audio and video in KBit/s
d67b0b15
PH
89 * abr Average audio bitrate in KBit/s
90 * acodec Name of the audio codec in use
dd27fd17 91 * asr Audio sampling rate in Hertz
d67b0b15 92 * vbr Average video bitrate in KBit/s
fbb21cf5 93 * fps Frame rate
d67b0b15 94 * vcodec Name of the video codec in use
1394ce65 95 * container Name of the container format
d67b0b15 96 * filesize The number of bytes, if known in advance
9732d77e 97 * filesize_approx An estimate for the number of bytes
d67b0b15 98 * player_url SWF Player URL (used for rtmpdump).
c7deaa4c
PH
99 * protocol The protocol that will be used for the actual
100 download, lower-case.
b04b8852
PH
101 "http", "https", "rtsp", "rtmp", "rtmpe",
102 "m3u8", or "m3u8_native".
f49d89ee 103 * preference Order number of this format. If this field is
08d13955 104 present and not None, the formats get sorted
38d63d84 105 by this field, regardless of all other values.
f49d89ee
PH
106 -1 for default (order by other properties),
107 -2 or smaller for less than default.
e65566a9
PH
108 < -1000 to hide the format (if there is
109 another one which is strictly better)
aff2f4f4
PH
110 * language_preference Is this in the correct requested
111 language?
112 10 if it's what the URL is about,
113 -1 for default (don't know),
114 -10 otherwise, other values reserved for now.
5d73273f
PH
115 * quality Order number of the video quality of this
116 format, irrespective of the file format.
117 -1 for default (order by other properties),
118 -2 or smaller for less than default.
c64ed2a3
PH
119 * source_preference Order number for this video source
120 (quality takes higher priority)
121 -1 for default (order by other properties),
122 -2 or smaller for less than default.
d769be6c
PH
123 * http_headers A dictionary of additional HTTP headers
124 to add to the request.
6271f1ca 125 * stretched_ratio If given and not 1, indicates that the
3dee7826
PH
126 video's pixels are not square.
127 width : height ratio as float.
128 * no_resume The server does not support resuming the
129 (HTTP or RTMP) download. Boolean.
130
c0ba0f48 131 url: Final video URL.
d6983cb4 132 ext: Video filename extension.
d67b0b15
PH
133 format: The video format, defaults to ext (used for --get-format)
134 player_url: SWF Player URL (used for rtmpdump).
2f5865cc 135
d6983cb4
PH
136 The following fields are optional:
137
f5e43bc6 138 alt_title: A secondary title of the video.
0afef30b
PH
139 display_id An alternative identifier for the video, not necessarily
140 unique, but available before title. Typically, id is
141 something like "4234987", title "Dancing naked mole rats",
142 and display_id "dancing-naked-mole-rats"
d5519808 143 thumbnails: A list of dictionaries, with the following entries:
cfb56d1a 144 * "id" (optional, string) - Thumbnail format ID
d5519808 145 * "url"
cfb56d1a 146 * "preference" (optional, int) - quality of the image
d5519808
PH
147 * "width" (optional, int)
148 * "height" (optional, int)
149 * "resolution" (optional, string "{width}x{height}",
150 deprecated)
d6983cb4 151 thumbnail: Full URL to a video thumbnail image.
f5e43bc6 152 description: Full video description.
d6983cb4 153 uploader: Full name of the video uploader.
9bb8e0a3 154 creator: The main artist who created the video.
955c4514 155 timestamp: UNIX timestamp of the moment the video became available.
d6983cb4 156 upload_date: Video upload date (YYYYMMDD).
955c4514 157 If not explicitly set, calculated from timestamp.
d6983cb4 158 uploader_id: Nickname or id of the video uploader.
da9ec3b9 159 location: Physical location where the video was filmed.
a504ced0
JMF
160 subtitles: The available subtitles as a dictionary in the format
161 {language: subformats}. "subformats" is a list sorted from
162 lower to higher preference, each element is a dictionary
163 with the "ext" entry and one of:
164 * "data": The subtitles file contents
10952eb2 165 * "url": A URL pointing to the subtitles file
360e1ca5
JMF
166 automatic_captions: Like 'subtitles', used by the YoutubeIE for
167 automatically generated captions
c0ba0f48 168 duration: Length of the video in seconds, as an integer.
f3d29461 169 view_count: How many users have watched the video on the platform.
19e3dfc9
PH
170 like_count: Number of positive ratings of the video
171 dislike_count: Number of negative ratings of the video
2d30521a 172 average_rating: Average rating given by users, the scale used depends on the webpage
19e3dfc9 173 comment_count: Number of comments on the video
dd622d7c
PH
174 comments: A list of comments, each with one or more of the following
175 properties (all but one of text or html optional):
176 * "author" - human-readable name of the comment author
177 * "author_id" - user ID of the comment author
178 * "id" - Comment ID
179 * "html" - Comment as HTML
180 * "text" - Plain text of the comment
181 * "timestamp" - UNIX timestamp of comment
182 * "parent" - ID of the comment this one is replying to.
183 Set to "root" to indicate that this is a
184 comment to the original video.
8dbe9899 185 age_limit: Age restriction for the video, as an integer (years)
10952eb2 186 webpage_url: The URL to the video webpage, if given to youtube-dl it
9103bbc5
JMF
187 should allow to get the same result again. (It will be set
188 by YoutubeDL if it's missing)
ad3bc6ac
PH
189 categories: A list of categories that the video falls in, for example
190 ["Sports", "Berlin"]
864f24bd 191 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
7267bd53
PH
192 is_live: True, False, or None (=unknown). Whether this video is a
193 live stream that goes on instead of a fixed-length video.
7c80519c 194 start_time: Time in seconds where the reproduction should start, as
10952eb2 195 specified in the URL.
297a564b 196 end_time: Time in seconds where the reproduction should end, as
10952eb2 197 specified in the URL.
d6983cb4 198
deefc05b 199 Unless mentioned otherwise, the fields should be Unicode strings.
d6983cb4 200
d838b1bd
PH
201 Unless mentioned otherwise, None is equivalent to absence of information.
202
fed5d032
PH
203
204 _type "playlist" indicates multiple videos.
b82f815f
PH
205 There must be a key "entries", which is a list, an iterable, or a PagedList
206 object, each element of which is a valid dictionary by this specification.
fed5d032 207
e0b9d78f
S
208 Additionally, playlists can have "title", "description" and "id" attributes
209 with the same semantics as videos (see above).
fed5d032
PH
210
211
212 _type "multi_video" indicates that there are multiple videos that
213 form a single show, for example multiple acts of an opera or TV episode.
214 It must have an entries key like a playlist and contain all the keys
215 required for a video at the same time.
216
217
218 _type "url" indicates that the video must be extracted from another
219 location, possibly by a different extractor. Its only required key is:
220 "url" - the next URL to extract.
f58766ce
PH
221 The key "ie_key" can be set to the class name (minus the trailing "IE",
222 e.g. "Youtube") if the extractor class is known in advance.
223 Additionally, the dictionary may have any properties of the resolved entity
224 known in advance, for example "title" if the title of the referred video is
fed5d032
PH
225 known ahead of time.
226
227
228 _type "url_transparent" entities have the same specification as "url", but
229 indicate that the given additional information is more precise than the one
230 associated with the resolved URL.
231 This is useful when a site employs a video service that hosts the video and
232 its technical metadata, but that video service does not embed a useful
233 title, description etc.
234
235
d6983cb4
PH
236 Subclasses of this one should re-define the _real_initialize() and
237 _real_extract() methods and define a _VALID_URL regexp.
238 Probably, they should also be added to the list of extractors.
239
d6983cb4
PH
240 Finally, the _WORKING attribute should be set to False for broken IEs
241 in order to warn the users and skip the tests.
242 """
243
244 _ready = False
245 _downloader = None
246 _WORKING = True
247
248 def __init__(self, downloader=None):
249 """Constructor. Receives an optional downloader."""
250 self._ready = False
251 self.set_downloader(downloader)
252
@classmethod
def suitable(cls, url):
    """Return True when url matches this extractor's _VALID_URL pattern."""
    # Deliberately look in cls.__dict__ instead of using hasattr/getattr:
    # the compiled pattern must be cached for *this* class, and attribute
    # lookup would also find a pattern cached on a superclass.
    try:
        compiled = cls.__dict__['_VALID_URL_RE']
    except KeyError:
        compiled = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return compiled.match(url) is not None
d6983cb4 263
ed9266db
PH
264 @classmethod
265 def _match_id(cls, url):
266 if '_VALID_URL_RE' not in cls.__dict__:
267 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
268 m = cls._VALID_URL_RE.match(url)
269 assert m
270 return m.group('id')
271
d6983cb4
PH
@classmethod
def working(cls):
    """Return the class's _WORKING flag."""
    is_working = cls._WORKING
    return is_working
276
def initialize(self):
    """Run _real_initialize() once per instance (authentication, etc)."""
    if self._ready:
        # Already initialized; nothing to do.
        return
    self._real_initialize()
    self._ready = True
282
def extract(self, url):
    """Initialize the extractor and return the result of _real_extract(url).

    ExtractorError is re-raised unchanged.  An IncompleteRead is wrapped in
    an ExtractorError marked as expected (a network problem), while
    KeyError and StopIteration escaping from the extraction code are
    wrapped in a regular ExtractorError.
    """
    try:
        self.initialize()
        return self._real_extract(url)
    except ExtractorError:
        raise
    except compat_http_client.IncompleteRead as e:
        raise ExtractorError('A network error has occurred.', cause=e, expected=True)
    except (KeyError, StopIteration) as e:
        raise ExtractorError('An extractor error has occurred.', cause=e)
d6983cb4
PH
294
def set_downloader(self, downloader):
    """Attach downloader to this extractor (stored as self._downloader)."""
    self._downloader = downloader
298
299 def _real_initialize(self):
300 """Real initialization process. Redefine in subclasses."""
301 pass
302
303 def _real_extract(self, url):
304 """Real extraction process. Redefine in subclasses."""
305 pass
306
56c73665
JMF
@classmethod
def ie_key(cls):
    """Return the key used to look this extractor up via get_info_extractor.

    This is the class name with its last two characters removed.
    """
    class_name = cls.__name__
    return class_name[:-2]
311
d6983cb4
PH
@property
def IE_NAME(self):
    """Name of the extractor: the class name minus its last two characters."""
    class_name = type(self).__name__
    return class_name[:-2]
315
7cc3570e 316 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
d6983cb4
PH
317 """ Returns the response handle """
318 if note is None:
319 self.report_download_webpage(video_id)
320 elif note is not False:
7cc3570e 321 if video_id is None:
f1a9d64e 322 self.to_screen('%s' % (note,))
7cc3570e 323 else:
f1a9d64e 324 self.to_screen('%s: %s' % (video_id, note))
d6983cb4 325 try:
dca08720 326 return self._downloader.urlopen(url_or_request)
d6983cb4 327 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
aa94a6d3
PH
328 if errnote is False:
329 return False
d6983cb4 330 if errnote is None:
f1a9d64e
PH
331 errnote = 'Unable to download webpage'
332 errmsg = '%s: %s' % (errnote, compat_str(err))
7cc3570e
PH
333 if fatal:
334 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
335 else:
336 self._downloader.report_warning(errmsg)
337 return False
d6983cb4 338
c9a77969 339 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
d6983cb4 340 """ Returns a tuple (page content as string, URL handle) """
b9d3e163
PH
341 # Strip hashes from the URL (#1038)
342 if isinstance(url_or_request, (compat_str, str)):
343 url_or_request = url_or_request.partition('#')[0]
344
7cc3570e
PH
345 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
346 if urlh is False:
347 assert not fatal
348 return False
c9a77969 349 content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
23be51d8
PH
350 return (content, urlh)
351
c9a77969
YCH
352 @staticmethod
353 def _guess_encoding_from_content(content_type, webpage_bytes):
d6983cb4
PH
354 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
355 if m:
356 encoding = m.group(1)
357 else:
0d75ae2c 358 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
f143d86a
PH
359 webpage_bytes[:1024])
360 if m:
361 encoding = m.group(1).decode('ascii')
b60016e8
PH
362 elif webpage_bytes.startswith(b'\xff\xfe'):
363 encoding = 'utf-16'
f143d86a
PH
364 else:
365 encoding = 'utf-8'
c9a77969
YCH
366
367 return encoding
368
369 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
370 content_type = urlh.headers.get('Content-Type', '')
371 webpage_bytes = urlh.read()
372 if prefix is not None:
373 webpage_bytes = prefix + webpage_bytes
374 if not encoding:
375 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
d6983cb4
PH
376 if self._downloader.params.get('dump_intermediate_pages', False):
377 try:
378 url = url_or_request.get_full_url()
379 except AttributeError:
380 url = url_or_request
f1a9d64e 381 self.to_screen('Dumping request to ' + url)
d6983cb4
PH
382 dump = base64.b64encode(webpage_bytes).decode('ascii')
383 self._downloader.to_screen(dump)
d41e6efc
PH
384 if self._downloader.params.get('write_pages', False):
385 try:
386 url = url_or_request.get_full_url()
387 except AttributeError:
388 url = url_or_request
5afa7f8b 389 basen = '%s_%s' % (video_id, url)
c1bce22f 390 if len(basen) > 240:
f1a9d64e 391 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
c1bce22f
PH
392 basen = basen[:240 - len(h)] + h
393 raw_filename = basen + '.dump'
d41e6efc 394 filename = sanitize_filename(raw_filename, restricted=True)
f1a9d64e 395 self.to_screen('Saving request to ' + filename)
5f58165d
S
396 # Working around MAX_PATH limitation on Windows (see
397 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
398 if os.name == 'nt':
399 absfilepath = os.path.abspath(filename)
400 if len(absfilepath) > 259:
401 filename = '\\\\?\\' + absfilepath
d41e6efc
PH
402 with open(filename, 'wb') as outf:
403 outf.write(webpage_bytes)
404
ec0fafbb
AA
405 try:
406 content = webpage_bytes.decode(encoding, 'replace')
407 except LookupError:
408 content = webpage_bytes.decode('utf-8', 'replace')
2410c43d 409
f1a9d64e
PH
410 if ('<title>Access to this site is blocked</title>' in content and
411 'Websense' in content[:512]):
412 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
2410c43d
PH
413 blocked_iframe = self._html_search_regex(
414 r'<iframe src="([^"]+)"', content,
f1a9d64e 415 'Websense information URL', default=None)
2410c43d 416 if blocked_iframe:
f1a9d64e 417 msg += ' Visit %s for more details' % blocked_iframe
2410c43d 418 raise ExtractorError(msg, expected=True)
77b2986b
PH
419 if '<title>The URL you requested has been blocked</title>' in content[:512]:
420 msg = (
421 'Access to this webpage has been blocked by Indian censorship. '
422 'Use a VPN or proxy server (with --proxy) to route around it.')
423 block_msg = self._html_search_regex(
424 r'</h1><p>(.*?)</p>',
425 content, 'block message', default=None)
426 if block_msg:
427 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
428 raise ExtractorError(msg, expected=True)
2410c43d 429
23be51d8 430 return content
d6983cb4 431
c9a77969 432 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
d6983cb4 433 """ Returns the data of the page as a string """
995ad69c
TF
434 success = False
435 try_count = 0
436 while success is False:
437 try:
c9a77969 438 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
995ad69c
TF
439 success = True
440 except compat_http_client.IncompleteRead as e:
441 try_count += 1
442 if try_count >= tries:
443 raise e
444 self._sleep(timeout, video_id)
7cc3570e
PH
445 if res is False:
446 return res
447 else:
448 content, _ = res
449 return content
d6983cb4 450
2a275ab0 451 def _download_xml(self, url_or_request, video_id,
f1a9d64e 452 note='Downloading XML', errnote='Unable to download XML',
c9a77969 453 transform_source=None, fatal=True, encoding=None):
267ed0c5 454 """Return the xml as an xml.etree.ElementTree.Element"""
28746fbd 455 xml_string = self._download_webpage(
c9a77969 456 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
28746fbd
PH
457 if xml_string is False:
458 return xml_string
e2b38da9
PH
459 if transform_source:
460 xml_string = transform_source(xml_string)
267ed0c5
JMF
461 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
462
3d3538e4 463 def _download_json(self, url_or_request, video_id,
f1a9d64e
PH
464 note='Downloading JSON metadata',
465 errnote='Unable to download JSON metadata',
b090af59 466 transform_source=None,
c9a77969 467 fatal=True, encoding=None):
b090af59 468 json_string = self._download_webpage(
c9a77969
YCH
469 url_or_request, video_id, note, errnote, fatal=fatal,
470 encoding=encoding)
b090af59
PH
471 if (not fatal) and json_string is False:
472 return None
ebb64199
TF
473 return self._parse_json(
474 json_string, video_id, transform_source=transform_source, fatal=fatal)
475
476 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
81c2f20b
PH
477 if transform_source:
478 json_string = transform_source(json_string)
3d3538e4
PH
479 try:
480 return json.loads(json_string)
481 except ValueError as ve:
e7b6d122
PH
482 errmsg = '%s: Failed to parse JSON ' % video_id
483 if fatal:
484 raise ExtractorError(errmsg, cause=ve)
485 else:
486 self.report_warning(errmsg + str(ve))
3d3538e4 487
def report_warning(self, msg, video_id=None):
    """Report msg as a warning through the downloader.

    The message is prefixed with '[IE_NAME] ' and, when video_id is not
    None, with 'video_id: '.
    """
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    warning = '[%s] %s%s' % (self.IE_NAME, idstr, msg)
    self._downloader.report_warning(warning)
f45f96f8 492
d6983cb4
PH
def to_screen(self, msg):
    """Print msg through the downloader, prefixed with '[IE_NAME] '."""
    line = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(line)
d6983cb4
PH
496
def report_extraction(self, id_or_name):
    """Announce on screen that information is being extracted for id_or_name."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
d6983cb4
PH
500
def report_download_webpage(self, video_id):
    """Announce on screen that the webpage for video_id is being downloaded."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
d6983cb4
PH
504
def report_age_confirmation(self):
    """Announce on screen that age confirmation is being attempted."""
    message = 'Confirming age'
    self.to_screen(message)
d6983cb4 508
fc79158d
JMF
def report_login(self):
    """Announce on screen that a login is being attempted."""
    message = 'Logging in'
    self.to_screen(message)
fc79158d 512
5f6a1245 513 # Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None, video_title=None):
    """Build a '_type': 'url' result pointing at a page that still has to be processed.

    'id' and 'title' are only included when video_id / video_title are
    not None.
    """
    # TODO: ie should be the class used for getting the info
    result = {
        '_type': 'url',
        'url': url,
        'ie_key': ie,
    }
    for key, value in (('id', video_id), ('title', video_title)):
        if value is not None:
            result[key] = value
    return result
5f6a1245 526
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
    """Build a '_type': 'playlist' result around entries.

    'id', 'title' and 'description' are only included when the
    corresponding argument is truthy.
    """
    result = {
        '_type': 'playlist',
        'entries': entries,
    }
    optional_fields = (
        ('id', playlist_id),
        ('title', playlist_title),
        ('description', playlist_description),
    )
    for key, value in optional_fields:
        if value:
            result[key] = value
    return result
539
c342041f 540 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
d6983cb4
PH
541 """
542 Perform a regex search on the given string, using a single or a list of
543 patterns returning the first matching group.
544 In case of failure return a default value or raise a WARNING or a
55b3e45b 545 RegexNotFoundError, depending on fatal, specifying the field name.
d6983cb4
PH
546 """
547 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
548 mobj = re.search(pattern, string, flags)
549 else:
550 for p in pattern:
551 mobj = re.search(p, string, flags)
c3415d1b
PH
552 if mobj:
553 break
d6983cb4 554
7e5db8c9 555 if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
f1a9d64e 556 _name = '\033[0;34m%s\033[0m' % name
d6983cb4
PH
557 else:
558 _name = name
559
560 if mobj:
711ede6e
PH
561 if group is None:
562 # return the first matching group
563 return next(g for g in mobj.groups() if g is not None)
564 else:
565 return mobj.group(group)
c342041f 566 elif default is not NO_DEFAULT:
d6983cb4
PH
567 return default
568 elif fatal:
f1a9d64e 569 raise RegexNotFoundError('Unable to extract %s' % _name)
d6983cb4 570 else:
08f2a92c 571 self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
d6983cb4
PH
572 return None
573
c342041f 574 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
d6983cb4
PH
575 """
576 Like _search_regex, but strips HTML tags and unescapes entities.
577 """
711ede6e 578 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
d6983cb4
PH
579 if res:
580 return clean_html(res).strip()
581 else:
582 return res
583
fc79158d
JMF
584 def _get_login_info(self):
585 """
cf0649f8 586 Get the login info as (username, password)
fc79158d
JMF
587 It will look in the netrc file using the _NETRC_MACHINE value
588 If there's no info available, return (None, None)
589 """
590 if self._downloader is None:
591 return (None, None)
592
593 username = None
594 password = None
595 downloader_params = self._downloader.params
596
597 # Attempt to use provided username and password or .netrc data
598 if downloader_params.get('username', None) is not None:
599 username = downloader_params['username']
600 password = downloader_params['password']
601 elif downloader_params.get('usenetrc', False):
602 try:
603 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
604 if info is not None:
605 username = info[0]
606 password = info[2]
607 else:
608 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
609 except (IOError, netrc.NetrcParseError) as err:
f1a9d64e 610 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
5f6a1245 611
fc79158d
JMF
612 return (username, password)
613
e64b7569 614 def _get_tfa_info(self, note='two-factor verification code'):
83317f69 615 """
616 Get the two-factor authentication info
617 TODO - asking the user will be required for sms/phone verify
618 currently just uses the command line option
619 If there's no info available, return None
620 """
621 if self._downloader is None:
83317f69 622 return None
623 downloader_params = self._downloader.params
624
625 if downloader_params.get('twofactor', None) is not None:
626 return downloader_params['twofactor']
627
e64b7569 628 return compat_getpass('Type %s and press [Return]: ' % note)
83317f69 629
46720279
JMF
630 # Helper functions for extracting OpenGraph info
631 @staticmethod
ab2d5247 632 def _og_regexes(prop):
c1206423 633 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
9887c9b2 634 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
78fb87b2 635 template = r'<meta[^>]+?%s[^>]+?%s'
ab2d5247 636 return [
78fb87b2
JMF
637 template % (property_re, content_re),
638 template % (content_re, property_re),
ab2d5247 639 ]
46720279 640
864f24bd
S
641 @staticmethod
642 def _meta_regex(prop):
643 return r'''(?isx)<meta
8b9848ac 644 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
864f24bd
S
645 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
646
3c4e6d83 647 def _og_search_property(self, prop, html, name=None, **kargs):
46720279 648 if name is None:
3c4e6d83 649 name = 'OpenGraph %s' % prop
ab2d5247 650 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
eb0a8398
PH
651 if escaped is None:
652 return None
653 return unescapeHTML(escaped)
46720279
JMF
654
655 def _og_search_thumbnail(self, html, **kargs):
10952eb2 656 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
46720279
JMF
657
658 def _og_search_description(self, html, **kargs):
659 return self._og_search_property('description', html, fatal=False, **kargs)
660
661 def _og_search_title(self, html, **kargs):
662 return self._og_search_property('title', html, **kargs)
663
8ffa13e0 664 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
a3681973
PH
665 regexes = self._og_regexes('video') + self._og_regexes('video:url')
666 if secure:
667 regexes = self._og_regexes('video:secure_url') + regexes
8ffa13e0 668 return self._html_search_regex(regexes, html, name, **kargs)
46720279 669
78338f71
JMF
670 def _og_search_url(self, html, **kargs):
671 return self._og_search_property('url', html, **kargs)
672
40c696e5 673 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
59040888
PH
674 if display_name is None:
675 display_name = name
676 return self._html_search_regex(
864f24bd 677 self._meta_regex(name),
711ede6e 678 html, display_name, fatal=fatal, group='content', **kwargs)
59040888
PH
679
680 def _dc_search_uploader(self, html):
681 return self._html_search_meta('dc.creator', html, 'uploader')
682
8dbe9899
PH
683 def _rta_search(self, html):
684 # See http://www.rtalabel.org/index.php?content=howtofaq#single
685 if re.search(r'(?ix)<meta\s+name="rating"\s+'
686 r' content="RTA-5042-1996-1400-1577-RTA"',
687 html):
688 return 18
689 return 0
690
59040888
PH
691 def _media_rating_search(self, html):
692 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
693 rating = self._html_search_meta('rating', html)
694
695 if not rating:
696 return None
697
698 RATING_TABLE = {
699 'safe for kids': 0,
700 'general': 8,
701 '14 years': 14,
702 'mature': 17,
703 'restricted': 19,
704 }
705 return RATING_TABLE.get(rating.lower(), None)
706
69319969 707 def _family_friendly_search(self, html):
6ca7732d 708 # See http://schema.org/VideoObject
69319969
NJ
709 family_friendly = self._html_search_meta('isFamilyFriendly', html)
710
711 if not family_friendly:
712 return None
713
714 RATING_TABLE = {
715 '1': 0,
716 'true': 0,
717 '0': 18,
718 'false': 18,
719 }
720 return RATING_TABLE.get(family_friendly.lower(), None)
721
0c708f11
JMF
722 def _twitter_search_player(self, html):
723 return self._html_search_meta('twitter:player', html,
9e1a5b84 724 'twitter card player')
0c708f11 725
27713812 726 @staticmethod
f8da79f8 727 def _hidden_inputs(html):
201ea3ee
S
728 hidden_inputs = {}
729 for input in re.findall(r'<input([^>]+)>', html):
730 if not re.search(r'type=(["\'])hidden\1', input):
731 continue
732 name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
733 if not name:
734 continue
735 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
736 if not value:
737 continue
738 hidden_inputs[name.group('value')] = value.group('value')
739 return hidden_inputs
27713812 740
cf61d96d
S
741 def _form_hidden_inputs(self, form_id, html):
742 form = self._search_regex(
743 r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
744 html, '%s form' % form_id, group='form')
745 return self._hidden_inputs(form)
746
3ded7bac 747 def _sort_formats(self, formats, field_preference=None):
7e8caf30 748 if not formats:
f1a9d64e 749 raise ExtractorError('No video formats found')
7e8caf30 750
4bcc7bd1 751 def _formats_key(f):
e6812ac9
PH
752 # TODO remove the following workaround
753 from ..utils import determine_ext
754 if not f.get('ext') and 'url' in f:
755 f['ext'] = determine_ext(f['url'])
756
3ded7bac
S
757 if isinstance(field_preference, (list, tuple)):
758 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
759
4bcc7bd1
PH
760 preference = f.get('preference')
761 if preference is None:
c7deaa4c
PH
762 proto = f.get('protocol')
763 if proto is None:
764 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
765
766 preference = 0 if proto in ['http', 'https'] else -0.1
4bcc7bd1
PH
767 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
768 preference -= 0.5
769
770 if f.get('vcodec') == 'none': # audio only
771 if self._downloader.params.get('prefer_free_formats'):
f1a9d64e 772 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
4bcc7bd1 773 else:
f1a9d64e 774 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
4bcc7bd1
PH
775 ext_preference = 0
776 try:
777 audio_ext_preference = ORDER.index(f['ext'])
778 except ValueError:
779 audio_ext_preference = -1
780 else:
781 if self._downloader.params.get('prefer_free_formats'):
f1a9d64e 782 ORDER = ['flv', 'mp4', 'webm']
4bcc7bd1 783 else:
f1a9d64e 784 ORDER = ['webm', 'flv', 'mp4']
4bcc7bd1
PH
785 try:
786 ext_preference = ORDER.index(f['ext'])
787 except ValueError:
788 ext_preference = -1
789 audio_ext_preference = 0
790
791 return (
792 preference,
aff2f4f4 793 f.get('language_preference') if f.get('language_preference') is not None else -1,
5d73273f 794 f.get('quality') if f.get('quality') is not None else -1,
9933b574 795 f.get('tbr') if f.get('tbr') is not None else -1,
03cd72b0 796 f.get('filesize') if f.get('filesize') is not None else -1,
4bcc7bd1 797 f.get('vbr') if f.get('vbr') is not None else -1,
1a6373ef
PH
798 f.get('height') if f.get('height') is not None else -1,
799 f.get('width') if f.get('width') is not None else -1,
1e1896f2 800 ext_preference,
4bcc7bd1
PH
801 f.get('abr') if f.get('abr') is not None else -1,
802 audio_ext_preference,
2c8e03d9 803 f.get('fps') if f.get('fps') is not None else -1,
9732d77e 804 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
c64ed2a3 805 f.get('source_preference') if f.get('source_preference') is not None else -1,
74f72824 806 f.get('format_id') if f.get('format_id') is not None else '',
4bcc7bd1
PH
807 )
808 formats.sort(key=_formats_key)
59040888 809
96a53167
S
810 def _check_formats(self, formats, video_id):
811 if formats:
812 formats[:] = filter(
813 lambda f: self._is_valid_url(
814 f['url'], video_id,
815 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
816 formats)
817
def _is_valid_url(self, url, video_id, item='video'):
    """Tell whether url can be requested successfully.

    Protocol-relative URLs are checked over http. Returns False (after
    printing a message) only when the request fails with an HTTP error;
    any other ExtractorError is re-raised.
    """
    url = self._proto_relative_url(url, scheme='http:')
    # For now assume non HTTP(S) URLs always valid
    if not url.startswith(('http://', 'https://')):
        return True
    try:
        self._request_webpage(url, video_id, 'Checking %s URL' % item)
    except ExtractorError as e:
        if not isinstance(e.cause, compat_HTTPError):
            raise
        self.to_screen(
            '%s: %s URL is invalid, skipping' % (video_id, item))
        return False
    return True
832
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    # The 'prefer_insecure' downloader parameter selects plain http
    if self._downloader.params.get('prefer_insecure', False):
        return 'http:'
    return 'https:'
839
57c7411f
PH
840 def _proto_relative_url(self, url, scheme=None):
841 if url is None:
842 return url
843 if url.startswith('//'):
844 if scheme is None:
845 scheme = self.http_scheme()
846 return scheme + url
847 else:
848 return url
849
4094b6e3
PH
def _sleep(self, timeout, video_id, msg_template=None):
    """Print a waiting message, then block for timeout seconds.

    msg_template is a %-style template that may reference the named
    fields video_id and timeout.
    """
    template = msg_template
    if template is None:
        template = '%(video_id)s: Waiting for %(timeout)s seconds'
    self.to_screen(template % {'video_id': video_id, 'timeout': timeout})
    time.sleep(timeout)
856
a38436e8
YCH
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                         transform_source=lambda s: fix_xml_ampersands(s).strip()):
    """Download an f4m manifest and return the sorted list of its formats.

    manifest_url -- URL of the f4m manifest
    video_id -- id used for the download messages
    preference -- value stored as 'preference' in every format
    f4m_id -- optional prefix of the generated format ids
    transform_source -- callable applied to the raw manifest before parsing;
                        also used for nested manifests
    """
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest',
        # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
        # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
        transform_source=transform_source)

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    # Relative media URLs are always resolved against the directory of this
    # manifest, never against a previously processed media URL.
    manifest_base_url = '/'.join(manifest_url.split('/')[:-1])
    for i, media_el in enumerate(media_nodes):
        format_url = manifest_url
        if manifest_version == '2.0':
            media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
            if not media_url:
                continue
            if media_url.startswith(('http://', 'https://')):
                format_url = media_url
            else:
                format_url = manifest_base_url + '/' + media_url
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            if determine_ext(format_url) == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    format_url, video_id, preference, f4m_id,
                    transform_source=transform_source))
                continue
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        formats.append({
            'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
            'url': format_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(media_el.attrib.get('width')),
            'height': int_or_none(media_el.attrib.get('height')),
            'preference': preference,
        })
    self._sort_formats(formats)

    return formats
900
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None,
                          m3u8_id=None, note=None, errnote=None,
                          fatal=True):
    """Download an m3u8 playlist and return the sorted list of its formats.

    The first format always points at the playlist itself ("meta" format).
    Every non-comment line of the playlist becomes one more format, using
    the attributes of the preceding #EXT-X-STREAM-INF / #EXT-X-MEDIA tags.

    m3u8_url -- URL of the playlist; relative entries are resolved against it
    ext -- value stored as 'ext' in every format
    entry_protocol -- value stored as 'protocol' in the per-entry formats
    preference -- value stored as 'preference' in the per-entry formats
    m3u8_id -- optional prefix of the generated format ids
    note, errnote -- messages for the download
    fatal -- passed to the download; when the download yields False an
             empty list is returned
    """
    formats = [{
        'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': preference - 1 if preference else -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }]

    def format_url(u):
        # Absolute http(s) URLs are used verbatim, anything else is relative
        if re.match(r'^https?://', u):
            return u
        return compat_urlparse.urljoin(m3u8_url, u)

    m3u8_doc = self._download_webpage(
        m3u8_url, video_id,
        note=note or 'Downloading m3u8 information',
        errnote=errnote or 'Failed to download m3u8 information',
        fatal=fatal)
    if m3u8_doc is False:
        # Keep the return type a list so that callers can extend() with it
        return []

    kv_rex = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')

    def parse_attributes(tag_line):
        # Turn the KEY=value,KEY="value" list of a tag into a dict,
        # dropping the surrounding quotes of quoted values
        attrs = {}
        for m in kv_rex.finditer(tag_line):
            v = m.group('val')
            if v.startswith('"'):
                v = v[1:-1]
            attrs[m.group('key')] = v
        return attrs

    last_info = None
    last_media = None
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_info = parse_attributes(line)
        elif line.startswith('#EXT-X-MEDIA:'):
            last_media = parse_attributes(line)
        elif line.startswith('#') or not line.strip():
            continue
        else:
            if last_info is None:
                formats.append({'url': format_url(line.strip())})
                continue
            tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
            format_id = []
            if m3u8_id:
                format_id.append(m3u8_id)
            last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
            format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
            f = {
                'format_id': '-'.join(format_id),
                'url': format_url(line.strip()),
                'tbr': tbr,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }
            codecs = last_info.get('CODECS')
            if codecs:
                # TODO: looks like video codec is not always necessarily goes first
                va_codecs = codecs.split(',')
                if va_codecs[0]:
                    f['vcodec'] = va_codecs[0].partition('.')[0]
                if len(va_codecs) > 1 and va_codecs[1]:
                    f['acodec'] = va_codecs[1].partition('.')[0]
            resolution = last_info.get('RESOLUTION')
            if resolution:
                # Tolerate an upper-case separator and ignore values that
                # are not of the form <width>x<height> instead of crashing
                res_m = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                if res_m:
                    f['width'] = int(res_m.group('width'))
                    f['height'] = int(res_m.group('height'))
            if last_media is not None:
                f['m3u8_media'] = last_media
                last_media = None
            formats.append(f)
            last_info = {}
    self._sort_formats(formats)
    return formats
987
a107193e
S
988 @staticmethod
989 def _xpath_ns(path, namespace=None):
990 if not namespace:
991 return path
992 out = []
993 for c in path.split('/'):
994 if not c or c == '.':
995 out.append(c)
996 else:
997 out.append('{%s}%s' % (namespace, c))
998 return '/'.join(out)
999
def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
    """Download the SMIL file at smil_url and return the formats it describes.

    Returns an empty list when the download yields False (only expected in
    non-fatal mode).
    """
    smil = self._download_smil(smil_url, video_id, fatal=fatal)
    if smil is False:
        assert not fatal
        return []

    smil_namespace = self._parse_smil_namespace(smil)
    return self._parse_smil_formats(
        smil, smil_url, video_id,
        namespace=smil_namespace, f4m_params=f4m_params)
1011
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
    """Download the SMIL file at smil_url and return the parsed info dict.

    Returns an empty dict when the download yields False.
    """
    smil = self._download_smil(smil_url, video_id, fatal=fatal)
    if smil is not False:
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
    return {}
1017
def _download_smil(self, smil_url, video_id, fatal=True):
    """Fetch smil_url through _download_xml with SMIL specific messages."""
    note = 'Downloading SMIL file'
    errnote = 'Unable to download SMIL file'
    return self._download_xml(smil_url, video_id, note, errnote, fatal=fatal)
1022
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """Build an info dict (id, title, description, formats, subtitles) from
    a parsed SMIL document.

    The passed video_id is only used while extracting the formats; the id
    of the result is the basename of smil_url without its extension.
    Title and description come from the first matching ./head/meta
    elements; the title falls back to the id.
    """
    namespace = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

    smil_id = os.path.splitext(url_basename(smil_url))[0]
    title = description = None
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        name = meta.attrib.get('name')
        content = meta.attrib.get('content')
        if not (name and content):
            continue
        if name == 'title':
            if not title:
                title = content
        elif name in ('description', 'abstract') and not description:
            description = content

    return {
        'id': smil_id,
        'title': title or smil_id,
        'description': description,
        'formats': formats,
        'subtitles': subtitles,
    }
1050
17712eeb
S
def _parse_smil_namespace(self, smil):
    """Return the namespace part of the root tag '{namespace}smil', or None
    when the tag does not have that form."""
    namespace_re = r'(?i)^{([^}]+)?}smil$'
    return self._search_regex(
        namespace_re, smil.tag, 'namespace', default=None)
1054
a107193e
S
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None):
    """Return the sorted list of formats described by the <video> elements
    of a parsed SMIL document.

    The base URL is the first 'base'/'httpBase' attribute found on a
    ./head/meta element, falling back to smil_url. RTMP entries, m3u8
    playlists, f4m manifests and plain http(s) sources are handled;
    anything else is ignored.
    """
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        meta_base = meta.get('base') or meta.get('httpBase')
        if meta_base:
            base = meta_base
            break

    formats = []
    rtmp_count = 0
    http_count = 0

    for video in smil.findall(self._xpath_ns('.//video', namespace)):
        src = video.get('src')
        if not src:
            continue

        bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
        # Fields shared by the rtmp and http format dicts
        common = {
            'tbr': bitrate,
            'filesize': int_or_none(video.get('size') or video.get('fileSize')),
            'width': int_or_none(video.get('width')),
            'height': int_or_none(video.get('height')),
        }
        proto = video.get('proto')
        ext = video.get('ext')
        src_ext = determine_ext(src)
        streamer = video.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            rtmp_format = {
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
            }
            rtmp_format.update(common)
            formats.append(rtmp_format)
            continue

        src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)

        if proto == 'm3u8' or src_ext == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                src_url, video_id, ext or 'mp4', m3u8_id='hls'))
        elif src_ext == 'f4m':
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            separator = '&' if '?' in src_url else '?'
            f4m_url = src_url + separator + compat_urllib_parse.urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
        elif src_url.startswith('http'):
            http_count += 1
            http_format = {
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
            }
            http_format.update(common)
            formats.append(http_format)

    self._sort_formats(formats)

    return formats
1131
a107193e
S
def _parse_smil_subtitles(self, smil, namespace=None):
    """Return a dict mapping language -> list of {'url', 'ext'} dicts built
    from the <textstream> elements of a parsed SMIL document.

    Elements without a src attribute are skipped. The language is taken
    from systemLanguage or systemLanguageName.
    """
    subtitles = {}
    for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)):
        src = textstream.get('src')
        if not src:
            continue
        ext = textstream.get('ext') or determine_ext(src)
        # Last resort: derive the extension from the MIME type
        if not ext and textstream.get('type') == 'text/srt':
            ext = 'srt'
        lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName')
        entry = {
            'url': src,
            'ext': ext,
        }
        subtitles.setdefault(lang, []).append(entry)
    return subtitles
63757032 1149
942acef5
S
def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
    """Download the XSPF playlist at playlist_url and return its entries.

    Returns an empty list when the download yields False.
    """
    xspf = self._download_xml(
        playlist_url, playlist_id, 'Downloading xspf playlist',
        'Unable to download xspf manifest', fatal=fatal)
    if xspf is False:
        return []
    return self._parse_xspf(xspf, playlist_id)
8d6765cf 1157
def _parse_xspf(self, playlist, playlist_id):
    """Return one info dict per <track> of a parsed XSPF playlist.

    Every entry uses playlist_id as its id; the title falls back to
    playlist_id as well. The duration is scaled by 1000.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def track_text(track, tag, name, **kwargs):
        # Text of the ./xspf:<tag> child of a track
        return xpath_text(
            track, xpath_with_ns('./xspf:%s' % tag, NS_MAP), name, **kwargs)

    entries = []
    for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
        title = track_text(track, 'title', 'title', default=playlist_id)
        description = track_text(track, 'annotation', 'description')
        thumbnail = track_text(track, 'image', 'thumbnail')
        duration = float_or_none(track_text(track, 'duration', 'duration'), 1000)

        formats = []
        for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
            formats.append({
                'url': location.text,
                'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            })
        self._sort_formats(formats)

        entries.append({
            'id': playlist_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        })
    return entries
1192
f4b1c7ad
PH
1193 def _live_title(self, name):
1194 """ Generate the title for a live video """
1195 now = datetime.datetime.now()
1196 now_str = now.strftime("%Y-%m-%d %H:%M")
1197 return name + ' ' + now_str
1198
b14f3a4c
PH
def _int(self, v, name, fatal=False, **kwargs):
    """Convert v with int_or_none(v, **kwargs) and report failures.

    When the conversion yields None, an ExtractorError is raised if fatal
    is true; otherwise a warning is reported and None is returned.
    name is only used in the error message.
    """
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
1210
def _float(self, v, name, fatal=False, **kwargs):
    """Convert v with float_or_none(v, **kwargs) and report failures.

    When the conversion yields None, an ExtractorError is raised if fatal
    is true; otherwise a warning is reported and None is returned.
    """
    res = float_or_none(v, **kwargs)
    if res is not None:
        return res
    msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(msg)
    self._downloader.report_warning(msg)
    return res
1220
def _set_cookie(self, domain, name, value, expire_time=None):
    """Store a cookie for domain (path '/') in the downloader's cookie jar."""
    cookie = compat_cookiejar.Cookie(
        version=0, name=name, value=value,
        port=None, port_specified=None,
        domain=domain, domain_specified=None, domain_initial_dot=None,
        path='/', path_specified=True,
        secure=False, expires=expire_time, discard='',
        comment=None, comment_url=None, rest=None)
    self._downloader.cookiejar.set_cookie(cookie)
1226
def _get_cookies(self, url):
    """ Return a compat_cookies.SimpleCookie with the cookies for the url """
    # Let the cookie jar fill in the Cookie header of a throwaway request
    request = compat_urllib_request.Request(url)
    self._downloader.cookiejar.add_cookie_header(request)
    cookie_header = request.get_header('Cookie')
    return compat_cookies.SimpleCookie(cookie_header)
1232
05900629
PH
def get_testcases(self, include_onlymatching=False):
    """Yield the test case dicts declared through _TEST or _TESTS.

    Each yielded dict gets a 'name' key holding the class name without its
    last two characters (the 'IE' suffix). Test cases flagged
    'only_matching' are skipped unless include_onlymatching is true.
    """
    single = getattr(self, '_TEST', None)
    if single:
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % type(self).__name__
        testcases = [single]
    else:
        testcases = getattr(self, '_TESTS', [])
    ie_name = type(self).__name__[:-len('IE')]
    for testcase in testcases:
        if testcase.get('only_matching', False) and not include_onlymatching:
            continue
        testcase['name'] = ie_name
        yield testcase
1246
def is_suitable(self, age_limit):
    """ Test whether the extractor is generally suitable for the given
    age limit (i.e. pornographic sites are not, all others usually are) """

    # Suitable as soon as one test case is not restricted; an extractor
    # without any test case is considered suitable as well.
    saw_restricted = False
    for tc in self.get_testcases(include_onlymatching=False):
        if 'playlist' in tc:
            tc = tc['playlist'][0]
        tc_age_limit = tc.get('info_dict', {}).get('age_limit')
        if not age_restricted(tc_age_limit, age_limit):
            return True
        saw_restricted = True
    return not saw_restricted
1261
def extract_subtitles(self, *args, **kwargs):
    """Return self._get_subtitles(*args, **kwargs) when subtitles were
    requested ('writesubtitles' or 'listsubtitles'), else an empty dict."""
    params = self._downloader.params
    if params.get('writesubtitles', False) or params.get('listsubtitles'):
        return self._get_subtitles(*args, **kwargs)
    return {}
a504ced0
JMF
1267
1268 def _get_subtitles(self, *args, **kwargs):
1269 raise NotImplementedError("This method must be implemented by subclasses")
1270
def extract_automatic_captions(self, *args, **kwargs):
    """Return self._get_automatic_captions(*args, **kwargs) when automatic
    captions were requested ('writeautomaticsub' or 'listsubtitles'), else
    an empty dict."""
    params = self._downloader.params
    if params.get('writeautomaticsub', False) or params.get('listsubtitles'):
        return self._get_automatic_captions(*args, **kwargs)
    return {}
360e1ca5
JMF
1276
1277 def _get_automatic_captions(self, *args, **kwargs):
1278 raise NotImplementedError("This method must be implemented by subclasses")
1279
8dbe9899 1280
d6983cb4
PH
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|<positive number>):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching this extractor's queries."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the query prefix and fetch the requested number of results.

        An empty prefix asks for one result, 'all' for _MAX_RESULTS and a
        number for that many results, capped at _MAX_RESULTS with a warning.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            n = 1
        elif prefix == 'all':
            n = self._MAX_RESULTS
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            if n > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY."""
        return self._SEARCH_KEY