]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[pluralsight] Rephrase
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import base64
7 import calendar
8 import codecs
9 import contextlib
10 import ctypes
11 import datetime
12 import email.utils
13 import errno
14 import functools
15 import gzip
16 import itertools
17 import io
18 import json
19 import locale
20 import math
21 import operator
22 import os
23 import pipes
24 import platform
25 import re
26 import ssl
27 import socket
28 import struct
29 import subprocess
30 import sys
31 import tempfile
32 import traceback
33 import xml.etree.ElementTree
34 import zlib
35
36 from .compat import (
37 compat_basestring,
38 compat_chr,
39 compat_etree_fromstring,
40 compat_html_entities,
41 compat_http_client,
42 compat_kwargs,
43 compat_parse_qs,
44 compat_socket_create_connection,
45 compat_str,
46 compat_urllib_error,
47 compat_urllib_parse,
48 compat_urllib_parse_urlparse,
49 compat_urllib_request,
50 compat_urlparse,
51 shlex_quote,
52 )
53
54
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers sent with every request; a desktop browser UA is
# used so that sites serve the same content they would to a real browser.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel used to distinguish "no default supplied" from a default of None
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
72
73
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        enc = locale.getpreferredencoding()
        # Verify the reported encoding actually works before trusting it
        'TEST'.encode(enc)
    except Exception:
        enc = 'UTF-8'
    return enc
87
88
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    # Create the temporary file next to the target so the final os.rename
    # stays on the same filesystem (cross-device renames are not atomic).
    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temp file; the original error is
        # re-raised either way.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
141
142
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # key/val are interpolated into the XPath expression below, so
        # restrict them to characters that cannot change its meaning.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val:
            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] (manual scan for Python 2.6) """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        for f in node.findall(xpath):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
167
168
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an xpath into '{uri}tag' form using ns_map."""
    expanded = []
    for component in path.split('/'):
        parts = component.split(':')
        if len(parts) == 1:
            # No namespace prefix on this step
            expanded.append(parts[0])
        else:
            prefix, tag = parts
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
179
180
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath (a string or a sequence of
    alternative xpaths) under node.

    Returns `default` if given and nothing matched; raises ExtractorError
    when fatal and no default; otherwise returns None on no match.
    """
    def _find_xpath(xp):
        if sys.version_info < (2, 7):  # Crazy 2.6
            xp = xp.encode('ascii')
        return node.find(xp)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # Try each alternative in turn. Initialize n so that an empty
        # sequence of alternatives falls through to the not-found handling
        # below instead of raising UnboundLocalError.
        n = None
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
204
205
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element, but return the matched element's text content."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is not None:
        return n.text
    # Element found but carries no text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
219
220
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Find element xpath with attribute key and return that attribute's value."""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
232
233
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper: an id is just an attribute lookup
    return get_element_by_attribute('id', id, html)
237
238
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if m is None:
        return None
    content = m.group('content')

    # Strip one wrapping quote pair if the content was quoted
    if content[:1] in ('"', "'"):
        content = content[1:-1]

    return unescapeHTML(content)
260
261
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return None

    # Turn markup line breaks into real newlines
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Drop every remaining tag
    html = re.sub('<.*?>', '', html)
    # Resolve entities and trim surrounding whitespace
    return unescapeHTML(html).strip()
277
278
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            # '-' means stdout; on Python 3 prefer the binary buffer
            if sys.platform == 'win32':
                import msvcrt
                # Put stdout into binary mode so bytes are not mangled
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors cannot be fixed by renaming; re-raise directly
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
309
310
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a parseable RFC 2822 date
        return None
    return email.utils.mktime_tz(parsed)
318
319
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Control characters and '?' are dropped entirely
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    # Keep timestamps like 12:34:56 readable: use '_' rather than ' -'
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(c) for c in s)
    if not is_id:
        # Collapse runs of underscores and trim leading/trailing ones
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[1:]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
356
357
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # Only Windows restricts these characters; everywhere else it's a no-op
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_parts = []
    for path_part in norm_path:
        if path_part in ['.', '..']:
            sanitized_parts.append(path_part)
        else:
            # Replace forbidden characters and a trailing dot/space
            sanitized_parts.append(
                re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part))
    if drive_or_unc:
        sanitized_parts.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_parts)
374
375
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, preserving order """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
383
384
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    known = compat_html_entities.name2codepoint
    if entity in known:
        return compat_chr(known[entity])

    # Numeric entity: decimal (#123) or hexadecimal (#x7B)
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
407
408
def unescapeHTML(s):
    """Replace all HTML entities in s with the characters they stand for."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _replace(m):
        return _htmlentity_transform(m.group(1))

    return re.sub(r'&([^;]+);', _replace, s)
416
417
def get_subprocess_encoding():
    """Return the encoding used for subprocess arguments and output."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    # Elsewhere, use the filesystem encoding, defaulting to UTF-8 if unset
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding
428
429
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.platform == 'win32' and not for_subprocess and sys.getwindowsversion()[0] >= 5:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
448
449
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename: decode a byte filename to text on Python 2."""
    # Python 3 filenames are already text; non-bytes input passes through
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
459
460
def encodeArgument(s):
    """Encode a subprocess argument, like encodeFilename does for paths."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, for_subprocess=True)
468
469
def decodeArgument(b):
    """Decode a subprocess argument (see decodeFilename)."""
    return decodeFilename(b, for_subprocess=True)
472
473
def decodeOption(optval):
    """Decode a command-line option value to text if it arrived as bytes."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
482
483
def formatSeconds(secs):
    """Format a duration in seconds as '[H:]MM:SS'-style text.

    Boundary fix: the original used strict '>' comparisons, so exactly
    3600 seconds rendered as '60:00' and exactly 60 seconds as '60'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
491
492
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler suited to the running Python's ssl module.

    params: options dict; only 'nocheckcertificate' is read here.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # User explicitly disabled certificate validation
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # No usable SSLContext support in HTTPSHandler on these versions
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        # NOTE(review): PROTOCOL_TLSv1 pins connections to TLS 1.0 here
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
516
517
def bug_reports_message():
    """Return the standard "please report this issue" blurb appended to
    unexpected-error messages."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return (
        '; please report this issue on https://yt-dl.org/bug .'
        ' Make sure you are using the latest version; %s.'
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        % update_cmd)
527
528
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Errors raised while handling network failures are never
        # youtube-dl bugs, so force them to be treated as "expected".
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get the "please report this" boilerplate
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback rendered as a string, or None."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
556
557
class UnsupportedError(ExtractorError):
    """Raised when no extractor handles the given URL (always 'expected')."""
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url
563
564
# Subclass of ExtractorError so callers can catch both uniformly
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
568
569
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
582
583
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
591
592
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Stored for callers that read .msg instead of str(exc)
        self.msg = msg
602
603
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
607
608
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
616
617
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.

    downloaded and expected are byte counts.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
630
631
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an HTTP(S) connection honouring the 'source_address' option.

    Used as the connection factory passed to urllib's do_open().
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address attribute: replace connect() so the socket
            # is created bound to the requested local address ourselves.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
657
658
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Route plain HTTP through _create_http_connection so options
        # like 'source_address' are honoured
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress a deflate-encoded body (raw first, then zlib-wrapped)."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response, setting .code manually on old Pythons."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
            new_req = req_type(
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout
            req = new_req

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
782
783
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPSHandler that supports 'source_address' and custom SSL contexts."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        # Allow callers/tests to substitute the connection class
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        # Only forward attributes the running Python's HTTPSHandler set up
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)
799
800
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor that reuses its HTTP hooks for HTTPS as well."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
823
824
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # strptime below has no %f handling here; drop fractional seconds
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone = datetime.timedelta()
        m = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if m:
            # Strip the timezone suffix before parsing the naive part
            date_str = date_str[:-len(m.group(0))]
            if m.group('sign'):
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        # Unparseable date
        return None
854
855
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD.

    day_first selects whether ambiguous numeric dates are read as DD/MM
    (True) or MM/DD (False). Returns None when nothing parses.
    """

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    # NOTE: there is deliberately no break on success — the LAST expression
    # in the list that parses date_str wins.
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 parsing
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
920
921
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL, falling back to default_ext."""
    if url is None:
        return default_ext
    # Take whatever follows the last '.' before any query string
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    return default_ext
930
931
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle filename: <base>.<language>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
934
935
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Plain YYYYMMDD date
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # Rough approximation: months and years are converted to days
    if unit == 'month':
        unit = 'day'
        amount *= 30
    elif unit == 'year':
        unit = 'day'
        amount *= 365
    return today + datetime.timedelta(**{unit + 's': amount})
963
964
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        # Anything that is not exactly 8 digits passes through untouched
        return date_str
    return '-'.join(match.groups())
973
974
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Omitted endpoints default to the widest representable range
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1004
1005
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Python 2 may hand back bytes; decode with the locale's encoding
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
1014
1015
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    # Rationale: on Windows, writing unicode to a console through the byte
    # stream mangles non-ANSI characters; WriteConsoleW writes them correctly.

    import ctypes
    import ctypes.wintypes

    # Map C file descriptors to Win32 standard-handle ids
    # (STD_OUTPUT_HANDLE == -11, STD_ERROR_HANDLE == -12).
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # WriteConsoleW only works on genuine console handles; redirected
        # files and pipes must use the regular byte-oriented write path.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first char outside the Basic Multilingual Plane
        # (i.e. one that needs a UTF-16 surrogate pair), or len(s).
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write BMP text in chunks of up to 1024 chars; a non-BMP char is
        # written alone, as two UTF-16 code units (count passed as 2).
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1089
1090
def write_string(s, out=None, encoding=None):
    """Robustly write the unicode string s to out (default: sys.stderr),
    encoding as appropriate for byte streams and the Windows console."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows, prefer the console-specific path for correct unicode output
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        chosen_enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(chosen_enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1111
1112
def bytes_to_intlist(bs):
    """Turn a bytes (or Python 2 str) value into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: indexing bytes yields ints
        return list(bs)
    return [ord(ch) for ch in bs]  # Python 2: indexing str yields 1-char strings
1120
1121
def intlist_to_bytes(xs):
    """Pack a list of integer byte values back into a bytes object."""
    return struct_pack('%dB' % len(xs), *xs) if xs else b''
1126
1127
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED struct required by LockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high 32-bit halves of the byte count: lock the whole file range
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Take a Win32 advisory lock over the whole of file object f."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for the later unlock call
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock previously taken by _lock_file()."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take a POSIX flock (exclusive or shared) on file object f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock previously taken by _lock_file()."""
        fcntl.flock(f, fcntl.LOCK_UN)
1191
1192
class locked_file(object):
    """File wrapper taking an advisory lock (exclusive for write/append,
    shared for read) while used as a context manager."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Writers need exclusivity; readers can share the lock.
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1222
1223
def get_filesystem_encoding():
    """Name of the filesystem encoding, defaulting to UTF-8 when unknown."""
    enc = sys.getfilesystemencoding()
    return 'utf-8' if enc is None else enc
1227
1228
def shell_quote(args):
    """Join args into one shell-safe command-line string."""
    fs_encoding = get_filesystem_encoding()
    quoted = []
    for arg in args:
        if isinstance(arg, bytes):
            # We may get a filename encoded with 'encodeFilename'
            arg = arg.decode(fs_encoding)
        quoted.append(pipes.quote(arg))
    return ' '.join(quoted)
1238
1239
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Encode the payload into the fragment, to be read back by unsmuggle_url()
    payload = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, payload)
1246
1247
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(): return (url, data), or (url, default)
    when no smuggled payload is present."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, payload = smug_url.rpartition('#')
    data = json.loads(compat_parse_qs(payload)['__youtubedl_smuggle'][0])
    return url, data
1255
1256
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. 1536 -> '1.50KiB'.

    Accepts ints, floats and numeric strings; returns 'N/A' for None.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    SUFFIXES = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # Clamp so absurdly large values don't index past 'YiB' (IndexError
        # in the original for sizes >= 1024**9)
        exponent = min(int(math.log(bytes, 1024.0)), len(SUFFIXES) - 1)
    suffix = SUFFIXES[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1269
1270
def parse_filesize(s):
    """Parse a human-readable size ('5.5MiB', '720 kB', ...) into a byte count.

    Returns None for None or unparseable input. Decimal multipliers (KB/Mb)
    and binary ones (KiB/kB) are distinguished exactly as the original
    longhand table did; the all-lowercase forms are unofficial but accepted.
    """
    if s is None:
        return None

    # For each SI prefix: 'XiB' and 'xB' are binary (1024**n),
    # 'XB' and 'Xb' are decimal (1000**n).
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for exp, prefix in enumerate('KMGTPEZY', 1):
        _UNIT_TABLE[prefix + 'iB'] = 1024 ** exp
        _UNIT_TABLE[prefix.lower() + 'B'] = 1024 ** exp
        _UNIT_TABLE[prefix + 'B'] = 1000 ** exp
        _UNIT_TABLE[prefix + 'b'] = 1000 ** exp

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not m:
        return None

    # Accept ',' as a decimal separator too
    num = float(m.group('num').replace(',', '.'))
    return int(num * _UNIT_TABLE[m.group('unit')])
1323
1324
def month_by_name(name):
    """Map a full English month name ('January', ...) to its 1-based number,
    or None when the name is not recognised (locale-independent)."""
    if name not in ENGLISH_MONTH_NAMES:
        return None
    return ENGLISH_MONTH_NAMES.index(name) + 1
1332
1333
def month_by_abbreviation(abbrev):
    """Map a three-letter English month abbreviation ('Jan', ...) to its
    1-based number, or None when not recognised (locale-independent)."""
    abbrevs = [name[:3] for name in ENGLISH_MONTH_NAMES]
    if abbrev not in abbrevs:
        return None
    return abbrevs.index(abbrev) + 1
1342
1343
def fix_xml_ampersands(xml_str):
    """Escape every bare '&' (one not already starting an entity) as '&amp;'."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;', xml_str)
1350
1351
def setproctitle(title):
    """Best effort: rename the current process via prctl(PR_SET_NAME).

    Silently does nothing when libc is unavailable or lacks prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    encoded = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(encoded))
    buf.value = encoded
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1365
1366
def remove_start(s, start):
    """Strip the prefix start from s, when present."""
    return s[len(start):] if s.startswith(start) else s
1371
1372
def remove_end(s, end):
    """Strip the suffix end from s, when present.

    Guards against an empty suffix: the original's s[:-len('')] == s[:0]
    wrongly collapsed the whole string to ''.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1377
1378
def url_basename(url):
    """Return the last path component of url ('http://a/b/c?q' -> 'c')."""
    parsed_path = compat_urlparse.urlparse(url).path
    return parsed_path.strip('/').split('/')[-1]
1382
1383
class HEADRequest(compat_urllib_request.Request):
    # Request subclass that issues HEAD instead of GET, for cheap probes
    # of headers/redirects without downloading the body.
    def get_method(self):
        return "HEAD"
1387
1388
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int scaled by invscale/scale, or default on failure.

    When get_attr is given, the named attribute of v is converted instead.
    None and '' yield default.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # TypeError covers non-numeric objects (lists, dicts, ...), which
        # previously crashed instead of falling back to default
        return default
1401
1402
def str_or_none(v, default=None):
    """Stringify v via compat_str, or return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1405
1406
def str_to_int(int_str):
    """Relaxed int parsing: ignores ',', '.' and '+' separators.

    Returns None for None; raises ValueError for other unparseable input.
    """
    if int_str is None:
        return None
    return int(re.sub(r'[,\.\+]', '', int_str))
1413
1414
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float scaled by invscale/scale, or default on failure."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError covers non-numeric objects (lists, dicts, ...), which
        # previously crashed instead of falling back to default
        return default
1422
1423
def parse_duration(s):
    """Parse a free-form duration string into seconds (int or float).

    Understands clock forms ('1:02:03'), unit suffixes ('3h45m12.5s',
    '12 min', '5.5 hours', '2 days 4:00:01') and ISO-8601-ish 'PT1H2M3S'.
    Returns None for non-string or unparseable input.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None
    res = 0
    # Fractional minutes/hours are only supported in the "only" forms,
    # which short-circuit here and may return a float.
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    if m.group('secs'):
        res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    if m.group('mins'):
        res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    if m.group('days'):
        res += int(m.group('days')) * 24 * 60 * 60
    if m.group('ms'):
        # 'ms' includes the leading dot, so this adds the fraction of a second
        res += float(m.group('ms'))
    return res
1468
1469
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the file's extension: 'a.mp4' + 'temp' -> 'a.temp.mp4'.

    When expected_real_ext is given and the actual extension differs,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1476
1477
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the file's extension for ext: 'a.mp4' + 'webm' -> 'a.webm'.

    When expected_real_ext is given and the actual extension differs,
    ext is appended to the whole filename instead of replacing anything.
    """
    name, real_ext = os.path.splitext(filename)
    keep_whole = expected_real_ext and real_ext[1:] != expected_real_ext
    return '{0}.{1}'.format(filename if keep_whole else name, ext)
1483
1484
def check_executable(exe, args=[]):
    """Return exe when it can be spawned from PATH, else False.

    args may carry flags giving a short run (e.g. ['-version']).
    """
    try:
        subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
1493
1494
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """Run exe with args and extract its version from the combined output.

    Returns False when the executable cannot be spawned at all.
    """
    try:
        proc = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = proc.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1508
1509
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Search version-command output for a version string.

    Falls back to a generic 'version <token>' pattern, and to the
    unrecognized marker when nothing matches.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1519
1520
class PagedList(object):
    # Abstract base; subclasses must provide getslice(start, end)
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1525
1526
class OnDemandPagedList(PagedList):
    # Paged list that calls pagefunc lazily, only for the pages that
    # getslice() actually needs.
    def __init__(self, pagefunc, pagesize):
        # pagefunc: pagenum -> iterable of results for that page
        # pagesize: number of results in a full page
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return results [start:end), fetching only the overlapping pages."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the slice start within this page (0 once past it)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset of the slice end within this page (None = take all)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1568
1569
class InAdvancePagedList(PagedList):
    # Paged list where the total page count is known up front.
    def __init__(self, pagefunc, pagecount, pagesize):
        # pagefunc: pagenum -> iterable of results for that page
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return results [start:end) by iterating just the covering pages."""
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize  # offset into first page
        only_more = None if end is None else end - start  # results still wanted
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page satisfies the remainder of the request
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1597
1598
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX (8 hex digit) escapes embedded in s."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
1605
1606
def lowercase_escape(s):
    """Decode literal \\uXXXX (4 hex digit) escapes embedded in s."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
1613
1614
def escape_rfc3986(s):
    """Percent-encode non-ASCII characters as suggested by RFC 3986."""
    # Python 2's quote() needs UTF-8 bytes for unicode input
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1620
1621
def escape_url(url):
    """Escape each component of a URL as suggested by RFC 3986."""
    parsed = compat_urllib_parse_urlparse(url)
    return parsed._replace(
        path=escape_rfc3986(parsed.path),
        params=escape_rfc3986(parsed.params),
        query=escape_rfc3986(parsed.query),
        fragment=escape_rfc3986(parsed.fragment),
    ).geturl()
1631
# struct.pack()/unpack() on Python 2.6 (and some 2.7 builds) reject unicode
# format strings; probe once and wrap them so callers can always pass str.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1648
1649
def read_batch_urls(batch_fd):
    """Read a batch-file handle and return its cleaned-up list of URLs.

    Strips a UTF-8 BOM and surrounding whitespace, and drops blank lines
    and comment lines starting with '#', ';' or ']'. Closes batch_fd.
    """
    def _clean(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(_clean, fd) if url]
1664
1665
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1668
1669
def encode_dict(d, encoding='utf-8'):
    """Encode every string key and value of d with the given encoding."""
    def _enc(x):
        return x.encode(encoding) if isinstance(x, compat_basestring) else x
    return dict((_enc(k), _enc(v)) for k, v in d.items())
1674
1675
# MPAA content rating -> minimum viewer age, used by parse_age_limit()
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1683
1684
def parse_age_limit(s):
    """Parse an age limit like '18' or '16+' into an int.

    Unmatched strings are looked up as US content ratings; None stays None.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
1690
1691
def strip_jsonp(code):
    """Unwrap a JSONP response, leaving only the JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1695
1696
def js_to_json(code):
    """Rewrite JavaScript-style object notation into valid JSON.

    Single-quoted strings become double-quoted, bare identifiers get
    quoted, and trailing commas before ']'/'}' are dropped.
    """
    def _fix_token(match):
        token = match.group(0)
        if token in ('true', 'false', 'null'):
            return token
        if token.startswith('"'):
            token = re.sub(r"\\'", "'", token[1:-1])
        elif token.startswith("'"):
            token = token[1:-1]
            token = re.sub(r"\\\\|\\'|\"", lambda esc: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[esc.group(0)], token)
        return '"%s"' % token

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', _fix_token, code)
    return re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1720
1721
def qualities(quality_ids):
    """Return a rating function mapping a quality id to its rank in
    quality_ids (higher is better); unknown ids rate -1."""
    def rank(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return rank
1730
1731
# Default output filename template: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1733
1734
1735 def limit_length(s, length):
1736 """ Add ellipses to overly long strings """
1737 if s is None:
1738 return None
1739 ELLIPSES = '...'
1740 if len(s) > length:
1741 return s[:length - len(ELLIPSES)] + ELLIPSES
1742 return s
1743
1744
def version_tuple(v):
    """Split a version like '2015.07.18-1' into a tuple of ints."""
    return tuple(int(part) for part in re.split(r'[-.]', v))
1747
1748
def is_outdated_version(version, limit, assume_new=True):
    """True when version is strictly older than limit.

    Missing or unparseable versions count as outdated only when
    assume_new is False.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
1756
1757
def ytdl_is_updateable():
    """Return True if this copy of youtube-dl can self-update with -U,
    i.e. it runs from a zipimport bundle or a frozen executable."""
    from zipimport import zipimporter
    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
1763
1764
def args_to_str(args):
    """Render a subprocess argv list as one shell-quoted display string."""
    return ' '.join(shlex_quote(arg) for arg in args)
1768
1769
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    Unknown subtypes come back as-is; None yields None instead of
    crashing on rpartition (e.g. when a Content-Type header is absent).
    """
    if mt is None:
        return None
    _, _, res = mt.rpartition('/')
    return {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
        'ttml+xml': 'ttml',
    }.get(res, res)
1778
1779
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a response, preferring the
    Content-Disposition filename over the Content-Type."""
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
1796
1797
def encode_data_uri(data, mime_type):
    """Wrap raw bytes into an RFC 2397 'data:' URI with a base64 payload."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
1800
1801
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No viewer limit configured, or content open to everyone: never block
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1810
1811
def is_html(first_bytes):
    """Heuristically detect HTML content from a file's leading bytes."""
    # Decode using a recognised BOM if present, else assume UTF-8
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    s = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if s is None:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
1830
1831
def determine_protocol(info_dict):
    """Work out the download protocol for an info dict: an explicit
    'protocol' field wins, then the URL scheme / container extension."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for streaming_proto in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(streaming_proto):
            return streaming_proto

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
1852
1853
def render_table(header_row, data):
    """Format header_row plus data rows into an aligned text table."""
    rows = [header_row] + data
    widths = [max(len(compat_str(cell)) for cell in col) for col in zip(*rows)]
    # Left-pad every column but the last to its widest cell (+1 space gap)
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
1860
1861
1862 def _match_one(filter_part, dct):
1863 COMPARISON_OPERATORS = {
1864 '<': operator.lt,
1865 '<=': operator.le,
1866 '>': operator.gt,
1867 '>=': operator.ge,
1868 '=': operator.eq,
1869 '!=': operator.ne,
1870 }
1871 operator_rex = re.compile(r'''(?x)\s*
1872 (?P<key>[a-z_]+)
1873 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1874 (?:
1875 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1876 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1877 )
1878 \s*$
1879 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1880 m = operator_rex.search(filter_part)
1881 if m:
1882 op = COMPARISON_OPERATORS[m.group('op')]
1883 if m.group('strval') is not None:
1884 if m.group('op') not in ('=', '!='):
1885 raise ValueError(
1886 'Operator %s does not support string values!' % m.group('op'))
1887 comparison_value = m.group('strval')
1888 else:
1889 try:
1890 comparison_value = int(m.group('intval'))
1891 except ValueError:
1892 comparison_value = parse_filesize(m.group('intval'))
1893 if comparison_value is None:
1894 comparison_value = parse_filesize(m.group('intval') + 'B')
1895 if comparison_value is None:
1896 raise ValueError(
1897 'Invalid integer value %r in filter part %r' % (
1898 m.group('intval'), filter_part))
1899 actual_value = dct.get(m.group('key'))
1900 if actual_value is None:
1901 return m.group('none_inclusive')
1902 return op(actual_value, comparison_value)
1903
1904 UNARY_OPERATORS = {
1905 '': lambda v: v is not None,
1906 '!': lambda v: v is None,
1907 }
1908 operator_rex = re.compile(r'''(?x)\s*
1909 (?P<op>%s)\s*(?P<key>[a-z_]+)
1910 \s*$
1911 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1912 m = operator_rex.search(filter_part)
1913 if m:
1914 op = UNARY_OPERATORS[m.group('op')]
1915 actual_value = dct.get(m.group('key'))
1916 return op(actual_value)
1917
1918 raise ValueError('Invalid filter part %r' % filter_part)
1919
1920
def match_str(filter_str, dct):
    """Filter a dictionary with a simple '&'-joined clause syntax;
    every clause must pass for the whole filter to pass."""
    clauses = filter_str.split('&')
    return all(_match_one(clause, dct) for clause in clauses)
1926
1927
def match_filter_func(filter_str):
    """Build a --match-filter callback: it returns None to accept a video,
    or a human-readable skip message when the filter rejects it."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
1936
1937
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression into seconds.

    Accepts plain offsets ('7', '7.5s') and clock values ('H:MM:SS.mmm').
    Empty input yields 0.0; unrecognised input falls through to None.
    """
    if not time_expr:
        return 0.0

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
1949
1950
def srt_subtitles_timecode(seconds):
    """Format a second count as an SRT timecode: 3661.5 -> '01:01:01,500'."""
    return '%02d:%02d:%02d,%03d' % (
        seconds / 3600,          # hours
        (seconds % 3600) / 60,   # minutes
        seconds % 60,            # whole seconds
        (seconds % 1) * 1000)    # milliseconds
1953
1954
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup (a unicode string) to SRT text.

    Raises ValueError when the document contains no <p> cues.
    """
    # Tag-name resolver covering both TTML namespace flavours
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

    def parse_node(node):
        # Flatten a <p>/<span> subtree to plain text, turning <br> into '\n'
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

        for child in node:
            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                out += '\n' + str_or_empty(child.tail)
            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                out += str_or_empty(parse_node(child))
            else:
                # Unknown markup: keep its raw serialized form
                out += str_or_empty(xml.etree.ElementTree.tostring(child))

        return out

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    # Cues may live in either TTML namespace, or in none at all
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        if not end_time:
            # No explicit end: derive it from the 'dur' attribute
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
1995
1996
def cli_option(params, command_option, param):
    """Emit [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2000
2001
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Emit a boolean command-line option from params[param].

    Returns [] when the parameter is absent, instead of tripping the
    bool assertion on None as the original did.
    """
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
2008
2009
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] when params[param] equals expected_value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2013
2014
def cli_configuration_args(params, param, default=[]):
    """Fetch the extra-arguments list params[param], or default when unset.

    NOTE: the shared mutable default is returned as-is, matching the
    original behaviour; callers must not mutate the result in place.
    """
    extra_args = params.get(param)
    if extra_args is None:
        return default
    assert isinstance(extra_args, list)
    return extra_args
2021
2022
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Maps 2-letter ISO 639-1 codes to 3-letter ISO 639-2/T codes.
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are significant; unknown codes map to None
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Linear reverse lookup; returns None implicitly for unknown codes
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
2223
2224
class ISO3166Utils(object):
    """Lookup of full country names from two-letter country codes.

    Data taken from http://data.okfn.org/data/core/country-list
    """

    # Maps ISO 3166-1 alpha-2 codes (upper case) to official short names.
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Return the full country name for an ISO 3166-1 alpha-2 code.

        Lookup is case-insensitive; returns None for unknown codes.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
2484
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets each request override the proxy via an
    internal 'Ytdl-request-proxy' header.
    """

    def __init__(self, proxies=None):
        # Register default openers for both schemes.  Loop-dependent
        # values are frozen as default arguments so every opener keeps
        # its own scheme instead of the loop's final value.
        for scheme in ('http', 'https'):
            def default_open(r, proxy='__noproxy__', scheme=scheme,
                             handler=self.proxy_open):
                return handler(r, proxy, scheme)
            setattr(self, '%s_open' % scheme, default_open)
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Open *req* through *proxy*, honoring a per-request override.

        A 'Ytdl-request-proxy' header replaces *proxy* for this request
        only; the sentinel '__noproxy__' disables proxying entirely.
        """
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            # The header is internal bookkeeping — strip it so it is
            # never sent over the wire.
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # fall through to a direct (unproxied) connection
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)