]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
Merge branch 'pr-democracynow' of https://github.com/atomicdryad/youtube-dl into...
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import base64
7 import calendar
8 import codecs
9 import contextlib
10 import ctypes
11 import datetime
12 import email.utils
13 import errno
14 import functools
15 import gzip
16 import itertools
17 import io
18 import json
19 import locale
20 import math
21 import operator
22 import os
23 import pipes
24 import platform
25 import re
26 import ssl
27 import socket
28 import struct
29 import subprocess
30 import sys
31 import tempfile
32 import traceback
33 import xml.etree.ElementTree
34 import zlib
35
36 from .compat import (
37 compat_basestring,
38 compat_chr,
39 compat_html_entities,
40 compat_http_client,
41 compat_kwargs,
42 compat_parse_qs,
43 compat_socket_create_connection,
44 compat_str,
45 compat_urllib_error,
46 compat_urllib_parse,
47 compat_urllib_parse_urlparse,
48 compat_urllib_request,
49 compat_urlparse,
50 shlex_quote,
51 )
52
53
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers sent with every request (see YoutubeDLHandler.http_request).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Unique sentinel used by the xpath_* helpers to mean "no default supplied",
# so that None itself remains a valid default value.
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
71
72
def preferredencoding():
    """Return the best text encoding name for this system.

    Uses locale.getpreferredencoding(), falling back to 'UTF-8' when that
    call fails or names a codec the interpreter cannot actually use.
    """
    encoding = 'UTF-8'
    try:
        candidate = locale.getpreferredencoding()
        'TEST'.encode(candidate)  # probe that the codec really exists
        encoding = candidate
    except Exception:
        pass
    return encoding
86
87
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object.
        # Fix: the original lambdas ignored their argument and always closed
        # over fn; they now operate on the argument they are given.
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    # Create the temp file next to the target so os.rename stays on one
    # filesystem (atomic on POSIX).
    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best effort: clean up the orphaned temp file, then re-raise.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
140
141
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        # Fix: was `if val:`, which silently skipped validation for falsy
        # non-None values (e.g. '') even though the expression below embeds
        # them; now consistent with the `val is None` test used there.
        if val is not None:
            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        for f in node.findall(xpath):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
167
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an xpath into '{uri}tag' using ns_map."""
    expanded = []
    for step in path.split('/'):
        parts = step.split(':')
        if len(parts) == 1:
            expanded.append(parts[0])
        else:
            prefix, tag = parts
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
178
179
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find an XML element by xpath; honour `default` / `fatal` on a miss."""
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None:
        return found
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element %s' % (xpath if name is None else name))
    return None
194
195
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element, but return the element's text content."""
    elem = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if elem is None or elem == default:
        return elem
    if elem.text is not None:
        return elem.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element\'s text %s' % (xpath if name is None else name))
    return None
209
210
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute `key` of the element matching xpath[@key]."""
    elem = find_xpath_attr(node, xpath, key)
    if elem is not None:
        return elem.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML attribute %s'
            % ('%s[@%s]' % (xpath, key) if name is None else name))
    return None
222
223
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper: an element's id is just another attribute.
    return get_element_by_attribute("id", id, html)
227
228
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    # Regex-based scrape: first tag whose attribute list contains
    # attribute=value (bare, single- or double-quoted).
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    m = re.search(pattern, html)
    if m is None:
        return None

    content = m.group('content')
    if content[:1] in ('"', "'"):
        content = content[1:-1]
    return unescapeHTML(content)
250
251
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    if html is None:  # Convenience for sanitizing descriptions etc.
        return None

    # Newline vs <br />
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Strip html tags
    text = re.sub('<.*?>', '', text)
    # Replace html entities
    return unescapeHTML(text).strip()
267
268
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            out = sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout
            return (out, filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        # Permission errors are not fixable by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
299
300
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
308
309
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    # Protect timestamps like 12:34:56 by replacing ':' with '_' up front
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(c) for c in s)
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
346
347
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # No-op everywhere else: POSIX paths only forbid NUL and '/'
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    parts = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        parts.pop(0)
    cleaned = [
        part if part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', part)
        for part in parts]
    if drive_or_unc:
        cleaned.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*cleaned)
364
365
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Uses list membership (only needs __eq__), so unhashable items work too.
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
373
374
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    name2cp = compat_html_entities.name2codepoint
    if entity in name2cp:
        return compat_chr(name2cp[entity])

    # Numeric entity: decimal (#160) or hexadecimal (#xA0)
    m = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if m is not None:
        numstr = m.group(1)
        if numstr.startswith('x'):
            return compat_chr(int('0' + numstr, 16))
        return compat_chr(int(numstr, 10))

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
393
394
def unescapeHTML(s):
    """Replace all HTML entities in s with their character equivalents."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _replace(m):
        return _htmlentity_transform(m.group(1))

    return re.sub(r'&([^;]+);', _replace, s)
402
403
def get_subprocess_encoding():
    """Return the encoding used to talk to subprocesses on this platform."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
414
415
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """
    assert type(s) == compat_str

    if sys.version_info >= (3, 0):
        # Python 3 has a Unicode API
        return s

    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
434
435
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename: turn an OS byte filename back into text."""
    # Python 3 (or already-text input): nothing to do.
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
445
446
def encodeArgument(s):
    """Encode a command-line argument for passing to a subprocess."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
454
455
def decodeArgument(b):
    """Decode a subprocess argument back to text (inverse of encodeArgument)."""
    return decodeFilename(b, True)
458
459
def decodeOption(optval):
    """Decode a command-line option value to text (None passes through)."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
468
469
def formatSeconds(secs):
    """Format a duration in seconds as 'S', 'M:SS' or 'H:MM:SS'.

    Fix: boundaries now use >= so that exactly 3600 s renders as '1:00:00'
    (previously '60:00') and exactly 60 s as '1:00' (previously '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
477
478
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler with the best TLS setup this Python offers.

    `params['nocheckcertificate']` disables certificate/hostname verification.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # Old Pythons: no usable SSLContext support in HTTPSHandler
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
502
503
def bug_reports_message():
    """Return the canned bug-report footer, tailored to how this copy updates."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = ('; please report this issue on https://yt-dl.org/bug .'
           ' Make sure you are using the latest version; %s.'
           ' Be sure to call youtube-dl with the --verbose flag and include its complete output.')
    return msg % update_cmd
513
514
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-level trouble is always "expected" (not a youtube-dl bug)
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True

        parts = []
        if video_id is not None:
            parts.append(video_id + ': ')
        parts.append(msg)
        if cause:
            parts.append(' (caused by %r)' % cause)
        if not expected:
            parts.append(bug_reports_message())
        super(ExtractorError, self).__init__(''.join(parts))

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the formatted original traceback, or None if none was given."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
542
543
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""

    def __init__(self, url):
        self.url = url
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
549
550
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
554
555
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept for callers that want to re-raise or inspect the original error
        self.exc_info = exc_info
568
569
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
577
578
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Fix: also pass msg to Exception.__init__ so str(e) and tracebacks
        # show the message (it used to be lost); keep .msg for existing callers.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
588
589
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
593
594
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
602
603
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts
        self.downloaded = downloaded
        self.expected = expected
616
617
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class, applying the configured source_address if any.

    Used by the do_open() calls in YoutubeDLHandler / YoutubeDLHTTPSHandler.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address support: replace connect() with one that
            # binds the socket to the requested local address itself.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
643
644
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # params: the YoutubeDL options dict (read by _create_http_connection)
        self._params = params

    def http_open(self, req):
        # Route connection creation through _create_http_connection so that
        # source_address and the py2 'strict' workaround are applied.
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress a deflate body (raw stream first, then zlib-wrapped)."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response, setting .code on old Pythons."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
            new_req = req_type(
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout
            req = new_req

        # Add the std_headers defaults without clobbering caller-set headers
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry, trimming up to 1023 trailing bytes
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
768
769
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that honours source_address and an optional SSL context."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        # Connection class is pluggable so callers can substitute a
        # certificate-ignoring or otherwise customized HTTPSConnection.
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        # Forward the parent handler's SSL settings only when this Python
        # version actually stored them (set by HTTPSHandler.__init__).
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)
785
786
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor that also applies cookie handling to HTTPS."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE(review): the workaround below is deliberately disabled; kept for
        # reference until the underlying issue is resolved.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
809
810
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # Drop fractional seconds; the strptime format below has none
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone = datetime.timedelta()
        m = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if m:
            date_str = date_str[:-len(m.group(0))]
            if m.group('sign'):
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    try:
        dt = datetime.datetime.strptime(
            date_str, '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        # Unparseable date: mirror the original silent-None contract
        return None
840
841
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None

    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    # Ambiguous numeric dates are resolved according to day_first
    if day_first:
        format_expressions.extend(['%d-%m-%Y', '%d.%m.%Y', '%d/%m/%Y', '%d/%m/%y', '%d/%m/%Y %H:%M:%S'])
    else:
        format_expressions.extend(['%m-%d-%Y', '%m.%d.%Y', '%m/%d/%Y', '%m/%d/%y', '%m/%d/%Y %H:%M:%S'])

    for expression in format_expressions:
        try:
            # Deliberately no break: the last matching expression wins,
            # exactly as in the original implementation
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822-style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
905
906
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL; fall back to default_ext."""
    if url is None:
        return default_ext
    # Strip the query string, then take everything after the last dot
    candidate = url.partition('?')[0].rpartition('.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
915
916
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle filename: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
919
920
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # Rough calendar approximations, as in the original implementation
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    return today + datetime.timedelta(**{unit + 's': amount})
948
949
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    m = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    # Anything that is not exactly eight digits passes through unchanged
    return '-'.join(m.groups()) if m else date_str
958
959
960 class DateRange(object):
961 """Represents a time interval between two dates"""
962
963 def __init__(self, start=None, end=None):
964 """start and end must be strings in the format accepted by date"""
965 if start is not None:
966 self.start = date_from_str(start)
967 else:
968 self.start = datetime.datetime.min.date()
969 if end is not None:
970 self.end = date_from_str(end)
971 else:
972 self.end = datetime.datetime.max.date()
973 if self.start > self.end:
974 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
975
976 @classmethod
977 def day(cls, day):
978 """Returns a range that only contains the given day"""
979 return cls(day, day)
980
981 def __contains__(self, date):
982 """Check if the date is in the range"""
983 if not isinstance(date, datetime.date):
984 date = date_from_str(date)
985 return self.start <= date <= self.end
986
987 def __str__(self):
988 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
989
990
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # On Python 2 this may come back as bytes; normalize to text
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
999
1000
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    # Writes Unicode directly to a Windows console via WriteConsoleW,
    # bypassing the (lossy) codepage-based stream encoding.

    import ctypes
    import ctypes.wintypes

    # Map stdout/stderr file descriptors to GetStdHandle ids
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is not a real console (e.g. redirected to a
        # file or pipe), in which case WriteConsoleW must not be used
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        # (these need to be written as a surrogate pair), or len(s)
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 BMP characters at a time
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1074
1075
def write_string(s, out=None, encoding=None):
    """Write the text *s* to *out* (default: stderr), coping with streams
    that want bytes (binary mode / Python 2) and with Windows consoles."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        # Try the WriteConsoleW fast path; returns True when it handled it
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Python 3 text stream: write pre-encoded bytes to the raw buffer so
        # the stream's own (possibly stricter) encoder is bypassed
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
1096
1097
def bytes_to_intlist(bs):
    """Return the byte values of *bs* as a list of ints (empty input -> [])."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: indexing bytes yields ints
        return list(bs)
    return [ord(c) for c in bs]  # Python 2: indexing yields 1-char strings
1105
1106
def intlist_to_bytes(xs):
    """Pack a list of ints (0-255) back into a bytes object."""
    if not xs:
        return b''
    # struct_pack is the py2.6-safe wrapper around struct.pack defined below
    return struct_pack('%dB' % len(xs), *xs)
1111
1112
# Cross-platform file locking: defines _lock_file(f, exclusive) and
# _unlock_file(f) on top of LockFileEx/UnlockFileEx (Windows) or
# fcntl.flock (everywhere else).
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure expected by (Un)LockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the largest possible byte range so the whole file is covered
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for the later unlock call
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
1176
1177
class locked_file(object):
    """Context manager wrapping io.open() that holds an OS-level file lock.

    'r' mode takes a shared lock, 'a'/'w' an exclusive one; the lock is
    released and the file closed on exit.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Writing modes need exclusivity; readers may share the lock
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            # Locking failed: don't leak the open file handle
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1207
1208
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to 'utf-8' when unknown."""
    enc = sys.getfilesystemencoding()
    return 'utf-8' if enc is None else enc
1212
1213
def shell_quote(args):
    """Join *args* into one shell-escaped command-line string."""
    encoding = get_filesystem_encoding()

    def _as_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(encoding) if isinstance(a, bytes) else a

    return ' '.join(pipes.quote(_as_text(a)) for a in args)
1223
1224
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    payload = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, payload)
1231
1232
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(): returns (url, data), or (url, default)
    when no smuggled payload is present."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    payload = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1240
1241
def format_bytes(bytes):
    """Render a byte count as a human-readable string, e.g. '1.27MiB'."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # Pick the 1024-based magnitude (0 for zero to avoid log(0))
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1254
1255
def parse_filesize(s):
    """Parse a human file size like '5 MB' or '1,5GiB' into bytes (int),
    or None if *s* is None or unparsable."""
    if s is None:
        return None

    # Accept binary units (KiB = 1024**n), decimal units (KB = 1000**n) and
    # the incorrect/unofficial case variants: lower-case prefix + 'B' is
    # treated as binary ('kB' = 1024), prefix + lower-case 'b' as decimal.
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for exp, prefix in enumerate('KMGTPEZY', 1):
        _UNIT_TABLE[prefix + 'iB'] = 1024 ** exp
        _UNIT_TABLE[prefix + 'B'] = 1000 ** exp
        _UNIT_TABLE[prefix.lower() + 'B'] = 1024 ** exp
        _UNIT_TABLE[prefix + 'b'] = 1000 ** exp

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not m:
        return None

    # Decimal comma is normalized to a dot before parsing
    num = float(m.group('num').replace(',', '.'))
    return int(num * _UNIT_TABLE[m.group('unit')])
1308
1309
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1317
1318
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [name[:3] for name in ENGLISH_MONTH_NAMES]
    if abbrev in abbreviations:
        return abbreviations.index(abbrev) + 1
    return None
1327
1328
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Leave already-valid entities and character references untouched
    bare_amp = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_amp, '&amp;', xml_str)
1335
1336
def setproctitle(title):
    """Set the process name (as shown by e.g. ps) via glibc prctl.

    Silently does nothing when libc.so.6 cannot be loaded or does not
    expose prctl (non-Linux systems).
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 is PR_SET_NAME (see prctl(2))
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1350
1351
def remove_start(s, start):
    """Strip *start* from the beginning of *s* when present."""
    return s[len(start):] if s.startswith(start) else s
1356
1357
def remove_end(s, end):
    """Strip *end* from the end of *s* when present.

    An empty *end* is a no-op. (The previous implementation returned
    s[:-len('')] == s[:0] == '' in that case, silently discarding the
    whole string, because every string endswith('').)
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1362
1363
def url_basename(url):
    """Return the last path component of *url* (query/fragment excluded)."""
    url_path = compat_urlparse.urlparse(url).path
    return url_path.strip('/').rpartition('/')[2]
1367
1368
class HEADRequest(compat_urllib_request.Request):
    """A Request subclass that performs an HTTP HEAD instead of GET."""

    def get_method(self):
        # urllib consults this method to decide which HTTP verb to send
        return "HEAD"
1372
1373
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* to an int scaled by invscale/scale, or *default* on failure.

    If *get_attr* is given, the value is first read from that attribute of v.
    The empty string and None both yield *default*.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # TypeError: non-numeric objects (lists, dicts, ...) should degrade
        # to the default just like unparsable strings, not propagate.
        return default
1386
1387
def str_or_none(v, default=None):
    """Coerce *v* to text, or return *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
1390
1391
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and plus signs before converting
    return int(re.sub(r'[,\.\+]', '', int_str))
1398
1399
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to a float scaled by invscale/scale, or *default* on failure."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: non-numeric objects (lists, dicts, ...) should degrade
        # to the default just like unparsable strings, not propagate.
        return default
1407
1408
def parse_duration(s):
    """Parse a duration string ('1:23', '90 min', '2h3m4s', 'PT1H2M3S', ...)
    into a number of seconds, or None if it cannot be parsed."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # Purely minute- or hour-denominated inputs may be fractional
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Otherwise sum whichever components matched, scaled to seconds
    _FIELD_SCALE = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    res = 0
    for field, scale in _FIELD_SCALE:
        if m.group(field):
            res += int(m.group(field)) * scale
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
1453
1454
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the file's extension: 'a.mp4' -> 'a.<ext>.mp4'.

    If *expected_real_ext* is given and the actual extension differs,
    *ext* is appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1461
1462
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the file's extension for *ext*: 'a.mp4' -> 'a.<ext>'.

    If *expected_real_ext* is given and the actual extension differs,
    *ext* is appended to the full filename instead of replacing.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = name
    return '{0}.{1}'.format(base, ext)
1468
1469
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    cmd = [exe] + args
    try:
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        # Spawning failed -> binary not present (or not runnable)
        return False
    return exe
1478
1479
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        proc = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = proc.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1493
1494
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from program *output*; *unrecognized*
    is returned when no version can be found."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1504
1505
class PagedList(object):
    """Base class for lazily paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1510
1511
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages one at a time via pagefunc, stopping
    as soon as the requested slice is satisfied."""

    def __init__(self, pagefunc, pagesize):
        # pagefunc(pagenum) must yield the entries of 0-based page pagenum
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices covered by this page: [firstid, nextfirstid)
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # In-page index of the first wanted entry
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # In-page index one past the last wanted entry (None = keep all)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1553
1554
class InAdvancePagedList(PagedList):
    """PagedList for sources whose total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        # pagefunc(pagenum) must yield the entries of 0-based page pagenum
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list, visiting only
        the pages that intersect the slice."""
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Offset of `start` within the first visited page
        skip_elems = start - start_page * self._pagesize
        # Number of entries still wanted (None = unbounded)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the slice; trim and stop
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1582
1583
def uppercase_escape(s):
    """Expand literal '\\UXXXXXXXX' escape sequences in *s* into the
    characters they denote."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(m):
        return decode(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
1590
1591
def lowercase_escape(s):
    """Expand literal '\\uXXXX' escape sequences in *s* into the
    characters they denote."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(m):
        return decode(m.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _expand, s)
1598
1599
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # quote() on Python 2 needs a byte string
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # The safe-set keeps percent signs and all reserved characters intact,
    # so already-escaped input is not double-escaped
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1605
1606
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parsed = compat_urllib_parse_urlparse(url)
    escaped = parsed._replace(
        path=escape_rfc3986(parsed.path),
        params=escape_rfc3986(parsed.params),
        query=escape_rfc3986(parsed.query),
        fragment=escape_rfc3986(parsed.fragment))
    return escaped.geturl()
1616
# struct.pack/unpack wrappers: on Python 2.6 (and some 2.7 builds) the
# format string must be bytes, so probe once and wrap if needed.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Modern interpreters accept text format strings directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1633
1634
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, one per line.

    Strips a UTF-8 BOM and surrounding whitespace, and drops empty lines
    and comment lines (starting with '#', ';' or ']'). Closes batch_fd.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM decodes to U+FEFF; the raw byte form is also handled
        # in case the text was read without decoding. (The original only
        # checked '\xef\xbb\xbf', which never matches properly decoded text.)
        for bom in ('\ufeff', '\xef\xbb\xbf'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1649
1650
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1653
1654
def encode_dict(d, encoding='utf-8'):
    """Encode every key and value of *d* (text -> bytes) with *encoding*."""
    encoded = {}
    for key, value in d.items():
        encoded[key.encode(encoding)] = value.encode(encoding)
    return encoded
1657
1658
# Element.iter() appeared in Python 2.7; fall back to findall on 2.6
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
1663
1664
def parse_xml(s):
    """Parse an XML document string, ignoring any doctype declaration;
    returns the root Element."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # The parser keyword is only supported from Python 2.7 on
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
1680
1681
# US (MPAA-style) content ratings mapped to a minimum viewer age
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1689
1690
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+', or a US rating name, into an
    integer age; None when *s* is None or unrecognized."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
1696
1697
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, leaving just the JSON payload."""
    wrapper_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(wrapper_re, r'\1', code)
1701
1702
def js_to_json(code):
    """Convert a JavaScript object literal into (mostly) valid JSON:
    quotes bare identifiers, normalizes single-quoted strings and drops
    trailing commas."""
    def fix_kv(m):
        token = m.group(0)
        # Keywords shared between JS and JSON pass through untouched
        if token in ('true', 'false', 'null'):
            return token
        if token.startswith('"'):
            token = re.sub(r"\\'", "'", token[1:-1])
        elif token.startswith("'"):
            token = token[1:-1]
            token = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], token)
        return '"%s"' % token

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Remove trailing commas before closing brackets/braces
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    return res
1726
1727
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality score; unknown ids rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1736
1737
# Default output filename template: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1739
1740
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    # Truncate so the result, ellipses included, fits in `length`
    return s[:length - len(ELLIPSES)] + ELLIPSES
1749
1750
def version_tuple(v):
    """Split a version string like '2015.07.21-1' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
1753
1754
def is_outdated_version(version, limit, assume_new=True):
    """Compare two version strings; when the input cannot be parsed the
    answer defaults to `not assume_new`."""
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
    return outdated
1762
1763
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updatable when running from a zip bundle or a frozen (binary) build
    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
1769
1770
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
1774
1775
def mimetype2ext(mt):
    """Map a MIME type to a filename extension; the subtype is used as-is
    unless it needs a special translation."""
    subtype = mt.rpartition('/')[2]
    special_cases = {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
        'ttml+xml': 'ttml',
    }
    return special_cases.get(subtype, subtype)
1784
1785
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a response from its HTTP headers:
    Content-Disposition filename first, then the Content-Type MIME type."""
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    disposition = getheader('Content-Disposition')
    if disposition:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
1802
1803
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI carrying *data* with the given MIME type."""
    b64 = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, b64)
1806
1807
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # Without both a viewer limit and a content limit, nothing to enforce
        return False
    return age_limit < content_limit
1816
1817
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Longer (UTF-32) BOMs must be checked before their UTF-16 prefixes
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    decoded = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
1836
1837
def determine_protocol(info_dict):
    """Work out the download protocol: the explicit 'protocol' field, a
    known URL scheme prefix, an extension-implied protocol, or finally the
    parsed URL scheme."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
1858
1859
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell per column decides that column's width
    widths = []
    for col in zip(*rows):
        widths.append(max(len(compat_str(v)) for v in col))
    # Every column but the last is left-justified to width + 1
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
1866
1867
def _match_one(filter_part, dct):
    """Evaluate one filter expression (e.g. 'duration > 600', '!is_live')
    against dict *dct*; raises ValueError on a malformed filter."""
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # Binary form: <key> <op>[?] <int-or-filesize-or-string>
    # The trailing '?' makes the filter pass when the key is missing
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # Strings only support (in)equality
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain int: try it as a file size ('500KiB'), then
                # with an implied 'B' suffix ('500K')
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    # Unary form: '<key>' (present) or '!<key>' (absent)
    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
1925
1926
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&' joins sub-filters; every one of them must pass
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1932
1933
def match_filter_func(filter_str):
    """Build a match-filter callback: returns None when the video passes,
    otherwise a human-readable skip message."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
1942
1943
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML time expression ('12.3s' or 'HH:MM:SS[.mmm]') into
    seconds. Empty input yields 0.0; an unrecognized format yields None
    (implicitly)."""
    if not time_expr:
        return 0.0

    m = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if m:
        return float(m.group('time_offset'))

    m = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if m:
        hours, mins, secs = m.groups()
        return 3600 * int(hours) + 60 * int(mins) + float(secs)
1955
1956
def srt_subtitles_timecode(seconds):
    """Format a second count as an SRT timecode 'HH:MM:SS,mmm'."""
    hours = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d truncates the float parts toward zero
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millis)
1959
1960
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup (a text string) into SRT format.

    Raises ValueError when the document contains no <p> cues.
    """
    # Helper to build namespace-qualified tag/xpath names for both TTML
    # namespace revisions used in the wild
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

    def parse_node(node):
        # Flatten a <p>/<span> subtree into plain text, turning <br> into
        # newlines and serializing any unknown child elements verbatim
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

        for child in node:
            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                out += '\n' + str_or_empty(child.tail)
            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                out += str_or_empty(parse_node(child))
            else:
                out += str_or_empty(xml.etree.ElementTree.tostring(child))

        return out

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        if not end_time:
            # No explicit end attribute: derive it from begin + duration
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2001
2002
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2006
2007
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as CLI arguments.

    Returns [] when the param is absent from *params*; previously a missing
    key produced None and crashed the isinstance assert below.
    """
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    value = true_value if param else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2014
2015
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2019
2020
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra CLI arguments stored under *param*,
    or *default* when unset."""
    configured = params.get(param)
    if configured is None:
        return default
    assert isinstance(configured, list)
    return configured
2027
2028
class ISO639Utils(object):
    """Conversions between two-letter (ISO 639-1) and three-letter
    (ISO 639-2/T) language codes, backed by a static table."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T.

        Only the first two characters are considered, so a locale code like
        'en-US' is looked up via 'en'. Returns None for unknown codes.
        """
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1.

        Returns None (implicitly) when the code is not in the table.
        """
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
2229
2230
class ISO3166Utils(object):
    """Static lookup from ISO 3166-1 alpha-2 country codes to full names."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Codes are stored upper-case; normalize so lookups are
        # case-insensitive. Unknown codes yield None.
        normalized = code.upper()
        return cls._country_map.get(normalized)
2489
2490
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler whose proxy can be overridden for a single request.

    A request may carry a 'Ytdl-request-proxy' header; its value replaces
    the handler-wide proxy for that request only. The sentinel value
    '__noproxy__' disables proxying for the request altogether.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        # Install http_open/https_open attributes that funnel every request
        # through proxy_open with the '__noproxy__' sentinel as the default
        # proxy. The lambda binds type/meth as default arguments to capture
        # the current loop values (avoids the late-binding closure pitfall).
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                    meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # Per-request override: when the 'Ytdl-request-proxy' header is
        # present, it supersedes the configured proxy and is removed so the
        # internal header is never sent over the wire.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        # '__noproxy__' means "connect directly": returning None tells the
        # OpenerDirector to try the next handler instead of proxying.
        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)