]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[youtube] Skip download for multiple v= test
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import calendar
7 import codecs
8 import contextlib
9 import ctypes
10 import datetime
11 import email.utils
12 import errno
13 import functools
14 import gzip
15 import itertools
16 import io
17 import json
18 import locale
19 import math
20 import operator
21 import os
22 import pipes
23 import platform
24 import re
25 import ssl
26 import socket
27 import struct
28 import subprocess
29 import sys
30 import tempfile
31 import traceback
32 import xml.etree.ElementTree
33 import zlib
34
35 from .compat import (
36 compat_basestring,
37 compat_chr,
38 compat_html_entities,
39 compat_http_client,
40 compat_kwargs,
41 compat_parse_qs,
42 compat_socket_create_connection,
43 compat_str,
44 compat_urllib_error,
45 compat_urllib_parse,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
48 compat_urlparse,
49 shlex_quote,
50 )
51
52
# The type of a compiled regular expression. The re module does not expose
# this type directly, so derive it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))
55
# Default HTTP headers added to every outgoing request
# (see YoutubeDLHandler.http_request below).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
63
64
# Unique sentinel meaning "no default was supplied" (None is a legitimate
# default value, so it cannot be used for this purpose).
NO_DEFAULT = object()

# English month names, independent of the current locale.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
70
71
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Probe that the reported codec actually exists and is usable.
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
85
86
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible.

    The data is first written to a temporary file in the same directory and
    then renamed over fn, so readers never observe a partially written file.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object.
        # (Fixed: the lambdas now use their argument f instead of silently
        # ignoring it and closing over fn.)
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort removal of the temporary file; re-raise the original
        # error so the caller sees what actually went wrong.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
139
140
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z-]+$', key)
        # Validate whenever a value is supplied, including the empty string.
        # (The old `if val:` guard skipped validation for '' even though the
        # expression below still used the value.)
        if val is not None:
            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        for f in node.findall(xpath):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
162
163 # On python2.6 the xml.etree.ElementTree.Element methods don't support
164 # the namespace parameter
165
166
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into ElementTree '{uri}tag' form."""
    def expand(step):
        if ':' not in step:
            return step
        prefix, tag = step.split(':')
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
177
178
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the first element matching xpath.

    When nothing is found: return default if one was given, raise
    ExtractorError if fatal is set, otherwise return None.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element %s' % (xpath if name is None else name))
    return None
193
194
def get_element_by_id(id, html):
    """Return the content of the tag carrying the given id attribute in html."""
    return get_element_by_attribute('id', id, html)
198
199
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # Verbose regex: an opening tag with any attributes before/after the one
    # we look for, the (non-greedy) content, then the matching closing tag.
    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not m:
        return None
    res = m.group('content')

    # Strip one level of surrounding quotes, if present.
    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    return unescapeHTML(res)
221
222
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Turn <br> and paragraph boundaries into newlines instead of literal ones.
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Drop every remaining tag, then decode HTML entities.
    text = re.sub('<.*?>', '', text)
    text = unescapeHTML(text)
    return text.strip()
238
239
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            # '-' means standard output; on Windows switch stdout to binary
            # mode first so byte output is not mangled by CRLF translation.
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission error will not be fixed by renaming: re-raise as-is.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
270
271
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
279
280
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def _map_char(ch):
        code = ord(ch)
        if ch == '?' or code < 32 or code == 127:
            return ''
        if ch == '"':
            return '' if restricted else '\''
        if ch == ':':
            return '_-' if restricted else ' -'
        if ch in '\\/|*<>':
            return '_'
        if restricted and (ch in '!&\'()[]{}$;`^,#' or ch.isspace() or code > 127):
            return '_'
        return ch

    # Handle timestamps: keep the digits of 12:34:56 together, replacing the
    # colons with underscores before per-character sanitization runs.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(_map_char(ch) for ch in s)
    if is_id:
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[1:]
    result = result.lstrip('.')
    return result or '_'
317
318
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        # Only Windows imposes the character restrictions handled below.
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    parts = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        parts.pop(0)
    sanitized_parts = []
    for part in parts:
        if part in ('.', '..'):
            sanitized_parts.append(part)
        else:
            # Replace forbidden characters (and a trailing dot) with '#'.
            sanitized_parts.append(re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', part))
    if drive_or_unc:
        sanitized_parts.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_parts)
335
336
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first-seen order.

    Membership is equality-based (list scan), so unhashable elements work too.
    """
    result = []
    for item in iterable:
        if item in result:
            continue
        result.append(item)
    return result
344
345
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    codepoint = compat_html_entities.name2codepoint.get(entity)
    if codepoint is not None:
        return compat_chr(codepoint)

    # Numeric entity: decimal (#160) or hexadecimal (#xA0).
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            return compat_chr(int(numstr[1:], 16))
        return compat_chr(int(numstr, 10))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
364
365
def unescapeHTML(s):
    """Replace all HTML entities in s with their character equivalents."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _replace(mobj):
        return _htmlentity_transform(mobj.group(1))

    return re.sub(r'&([^;]+);', _replace, s)
373
374
def get_subprocess_encoding():
    """Return the text encoding to use for subprocess arguments."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
385
386
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file (must be text, i.e. compat_str)
    @param for_subprocess Encode for passing to a subprocess (Python 2 only)
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Python 2 on non-Windows (or for subprocess use): encode to bytes,
    # dropping characters the encoding cannot represent.
    return s.encode(get_subprocess_encoding(), 'ignore')
405
406
def decodeFilename(b, for_subprocess=False):
    """Decode a byte filename to text on Python 2; pass anything else through."""

    if sys.version_info >= (3, 0):
        # Python 3 filenames are already text.
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
416
417
def encodeArgument(s):
    """Encode a command-line argument for handing to a subprocess."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, for_subprocess=True)
425
426
def decodeArgument(b):
    """Decode a subprocess argument back to text (Python 2 only)."""
    return decodeFilename(b, for_subprocess=True)
429
430
def decodeOption(optval):
    """Decode a command-line option value to text; None passes through."""
    if optval is None:
        return None
    decoded = (optval.decode(preferredencoding())
               if isinstance(optval, bytes) else optval)

    assert isinstance(decoded, compat_str)
    return decoded
439
440
def formatSeconds(secs):
    """Format a duration in seconds as 'h:mm:ss', 'm:ss' or plain seconds.

    Boundary fix: exactly 3600 s is now '1:00:00' (was '60:00') and exactly
    60 s is now '1:00' (was '60') -- the comparisons were '>' instead of '>='.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
448
449
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler configured from the user params.

    Honours params['nocheckcertificate'] and picks the best SSL context
    construction available on the running Python version.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # No usable SSLContext support in HTTPSHandler on these versions.
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
473
474
def bug_reports_message():
    """Build the standard "please report this issue" suffix for error text."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return (
        '; please report this issue on https://yt-dl.org/bug .'
        ' Make sure you are using the latest version; %s.'
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        % update_cmd)
484
485
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is the underlying exception; video_id, if given, is
        prepended to the message.
        """

        # Network-level failures are always "expected" (not youtube-dl bugs).
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get the "please report this" boilerplate.
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback rendered as a string, or None."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
513
514
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""
    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
520
521
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match anything it was expected to."""
    pass
525
526
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Keep the original (type, value, traceback) triple for later display.
        self.exc_info = exc_info
539
540
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
548
549
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Also pass msg to the Exception base class so that str(exc) and
        # exc.args carry the message (previously they were empty).
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
559
560
class MaxDownloadsReached(Exception):
    """ Raised once the --max-downloads limit has been reached. """
    pass
564
565
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
573
574
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts: what was actually received vs what the
        # server's headers announced.
        self.downloaded = downloaded
        self.expected = expected
587
588
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class, honouring the configured 'source_address'.

    ydl_handler supplies the user parameters; is_https selects SSL wrapping
    in the Python 2.6 fallback path.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address support: replace connect() with a version that
            # binds the socket to the requested source address itself.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
609
610
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params  # user options, e.g. 'source_address'

    def http_open(self, req):
        """Open req over a connection built by _create_http_connection."""
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, accepting both raw and zlib-wrapped streams."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response; old Pythons lack the code argument."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Prepare an outgoing request: escape the URL, add std_headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
            new_req = req_type(
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout
            req = new_req

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            # Internal marker header: strip it and disable compression.
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Transparently decompress the response and re-escape redirects."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
733
734
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS counterpart of YoutubeDLHandler.

    Routes connections through _create_http_connection so options such as
    'source_address' are honoured; an alternative connection class may be
    supplied via https_conn_class.
    """
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Forward the SSL context / hostname checking configured on the base
        # handler, where this Python version supports them.
        kwargs = {}
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)
750
751
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    if timezone is None:
        # Extract an explicit UTC offset (or trailing Z) from the string,
        # also swallowing any fractional-second part in front of it.
        timezone = datetime.timedelta()
        m = re.search(
            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if m:
            date_str = date_str[:-len(m.group(0))]
            if m.group('sign'):
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
776
777
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD.

    day_first selects whether ambiguous numeric dates are tried day-month
    (True) or month-day (False). Returns None if nothing parses.
    """

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    # NOTE(review): there is no break in this loop, so every expression is
    # tried and the LAST one that parses wins -- confirm this ordering is
    # intentional before changing it.
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Fall back to RFC 2822 parsing (e.g. 'Wed, 14 May 2014 00:00:00 +0000').
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
841
842
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL, falling back to default_ext."""
    if url is None:
        return default_ext
    # Drop any query string, then take whatever follows the last dot.
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    return default_ext
851
852
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name: <base>.<language>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
855
856
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Plain absolute date.
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # Months and years are approximated as 30 and 365 days respectively.
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    return today + datetime.timedelta(**{unit + 's': amount})
884
885
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        # Not in YYYYMMDD form: hand back unchanged.
        return date_str
    return '-'.join(match.groups())
894
895
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = (date_from_str(start) if start is not None
                      else datetime.datetime.min.date())
        self.end = (date_from_str(end) if end is not None
                    else datetime.datetime.max.date())
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
925
926
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Python 2 may hand back a byte string in the locale encoding.
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
935
936
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    Writes Unicode text to a real Windows console via WriteConsoleW, which
    avoids the console codepage mangling non-ASCII characters."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map C file descriptors (stdout=1, stderr=2) to GetStdHandle ids.
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # Only a local character device on which GetConsoleMode succeeds is
        # a real console; anything else must use the normal write path.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        # (needs a UTF-16 surrogate pair), or len(s) if there is none.
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write in chunks of at most 1024 BMP characters; a non-BMP
        # character is written on its own as two UTF-16 code units.
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1010
1011
def write_string(s, out=None, encoding=None):
    """Write the text s to out (default sys.stderr), handling both streams
    that expect bytes and streams that expect text."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        # Try the Windows console fast path first (proper Unicode output).
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode ourselves so
        # we control the codec and the error handling.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
1032
1033
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    # On Python 3, indexing bytes already yields ints; on Python 2 it
    # yields 1-character strings that need ord()
    if isinstance(bs[0], int):
        return list(bs)
    return [ord(ch) for ch in bs]
1041
1042
def intlist_to_bytes(xs):
    """Convert a list of integer byte values (0-255) into a byte string.

    Inverse of bytes_to_intlist.  Goes through bytearray instead of the
    struct module: it works identically on Python 2 and 3, needs no
    format-spec shim, and has no limit on the number of values.
    """
    if not xs:
        return b''
    return bytes(bytearray(xs))
1047
1048
# Cross-platform file locking
# On Windows, fcntl does not exist, so LockFileEx/UnlockFileEx from
# kernel32 are bound via ctypes; elsewhere fcntl.flock is used.  Both
# branches define the same two helpers, _lock_file(f, exclusive) and
# _unlock_file(f), consumed by the locked_file wrapper below.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirror of the Win32 OVERLAPPED struct required by LockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the entire possible file range (low/high 32-bit halves)
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for the later unlock call
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
1112
1113
class locked_file(object):
    """File wrapper holding an OS-level lock for the duration of a `with`
    block: shared lock for 'r', exclusive lock for 'a'/'w'."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            # Readers share the lock; writers/appenders get it exclusively
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1143
1144
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to 'utf-8' when unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1148
1149
def shell_quote(args):
    """Join *args* into a single, shell-safe command-line string."""
    fs_encoding = get_filesystem_encoding()

    def _as_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(fs_encoding) if isinstance(a, bytes) else a

    return ' '.join(pipes.quote(_as_text(a)) for a in args)
1159
1160
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    fragment = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, fragment)
1167
1168
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(): return (url, data), or (url, default)
    when no smuggled payload is present."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, encoded = smug_url.rpartition('#')
    payload = compat_parse_qs(encoded)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1176
1177
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. '1.50KiB'.

    Accepts None (returns 'N/A'), numeric strings, ints and floats.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    _SUFFIXES = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # Clamp so values >= 1024**9 no longer raise IndexError on the
        # suffix lookup; they render as multiples of the largest unit
        exponent = min(int(math.log(bytes, 1024.0)), len(_SUFFIXES) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, _SUFFIXES[exponent])
1190
1191
def parse_filesize(s):
    """Parse a human-readable file size ('5MiB', '1,5KB') into bytes.

    Returns None for None input or unrecognized strings.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
    }

    units_re = '|'.join(re.escape(unit) for unit in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if m is None:
        return None

    # Accept both '.' and ',' as the decimal separator
    value = float(m.group('num').replace(',', '.'))
    return int(value * _UNIT_TABLE[m.group('unit')])
1244
1245
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1253
1254
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    if abbrev in abbreviations:
        return abbreviations.index(abbrev) + 1
    return None
1263
1264
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving existing entities
    (named and numeric) untouched."""
    stray_amp = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(stray_amp, '&amp;', xml_str)
1271
1272
def setproctitle(title):
    """Best-effort: rename the current process via glibc prctl(PR_SET_NAME).

    Silently does nothing when glibc is unavailable or lacks prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    encoded = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(encoded))
    buf.value = encoded
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1286
1287
def remove_start(s, start):
    """Strip the prefix *start* from *s* if present."""
    return s[len(start):] if s.startswith(start) else s
1292
1293
def remove_end(s, end):
    """Strip the suffix *end* from *s* if present."""
    return s[:-len(end)] if s.endswith(end) else s
1298
1299
def url_basename(url):
    """Return the last path component of *url*, ignoring query and fragment."""
    parsed_path = compat_urlparse.urlparse(url).path
    return parsed_path.strip('/').split('/')[-1]
1303
1304
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that issues an HTTP HEAD instead of GET."""

    def get_method(self):
        return "HEAD"
1308
1309
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* to an int, optionally reading attribute *get_attr* first
    and applying invscale/scale; return *default* for None or ''."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1317
1318
def str_or_none(v, default=None):
    """Stringify *v*, or return *default* when it is None."""
    if v is None:
        return default
    return compat_str(v)
1321
1322
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Strip thousands separators and '+' before converting
    return int(re.sub(r'[,\.\+]', '', int_str))
1329
1330
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to a float with invscale/scale applied; *default* for None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1333
1334
def parse_duration(s):
    """Parse a duration string ('3:12', '1h30m5s', 'PT1M30S', '5 min') into
    seconds, or None when *s* is not a string or does not match."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # Fractional "X minutes" / "X hours" forms short-circuit
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    duration = 0
    for field, scale in (
            ('secs', 1),
            ('mins_reversed', 60),
            ('mins', 60),
            ('hours', 60 * 60),
            ('hours_reversed', 60 * 60),
            ('days', 24 * 60 * 60)):
        if m.group(field):
            duration += int(m.group(field)) * scale
    if m.group('ms'):
        # Keep the result an int unless milliseconds are present
        duration += float(m.group('ms'))
    return duration
1379
1380
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension of *filename*; when
    *expected_real_ext* is given and does not match, append instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1387
1388
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of *filename* with *ext*; when
    *expected_real_ext* is given and does not match, append instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}'.format(name, ext)
1394
1395
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        proc = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1404
1405
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        proc = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = proc.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1419
1420
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's *output*, returning
    *unrecognized* when nothing matches."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1430
1431
class PagedList(object):
    """Base class for paged result lists; subclasses implement getslice()."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1436
1437
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages lazily via a callback.

    pagefunc(pagenum) must return an iterable with the entries of the
    0-based page *pagenum*; pagesize is the nominal entries-per-page count.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # Return the entries in [start, end) as a list, querying only the
        # pages that overlap that interval.
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Index just past the last wanted entry within this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1479
1480
class InAdvancePagedList(PagedList):
    """PagedList for which the total page count is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # Return the entries in [start, end) as a list.
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Offset into the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Number of entries still wanted (None = all remaining)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page satisfies the request; trim and stop
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1508
1509
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences in *s* into real characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
1516
1517
def lowercase_escape(s):
    """Decode \\uXXXX escape sequences in *s* into real characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
1524
1525
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    # Python 2's quote() wants UTF-8 bytes for non-ASCII input
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, safe_chars)
1531
1532
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    return parts._replace(
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    ).geturl()
1542
# Probe struct.pack once at import time: Python 2.6 (and some 2.7 builds)
# rejects text format specs, in which case wrappers that encode the spec
# to ASCII bytes are installed as struct_pack/struct_unpack.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Modern struct accepts text specs; use it directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1559
1560
def read_batch_urls(batch_fd):
    """Read a batch file object into a list of URLs, dropping BOMs,
    surrounding whitespace, blank lines and comment lines."""
    def _clean(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # Lines starting with '#', ';' or ']' are comments
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in (_clean(line) for line in fd) if url]
1575
1576
def urlencode_postdata(*args, **kargs):
    """urlencode the arguments and return ASCII bytes suitable for POST data."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1579
1580
# Element.iter() appeared in Python 2.7; emulate it via findall() on 2.6
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
1585
1586
def parse_xml(s):
    """Parse the XML string *s*, ignoring doctype declarations and fixing
    up Python 2's byte-string text nodes to unicode."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            # Ignore doctypes
            pass

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
1602
1603
# US (MPAA-style) content ratings mapped to the minimum viewer age
# they imply; consumed by parse_age_limit()
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1611
1612
def parse_age_limit(s):
    """Parse an age limit into an int.

    Accepts None (returns None), ints (passed through when they are a
    plausible age, 0-21), strings like '18' or '18+', and US content
    ratings such as 'PG-13'.  Returns None when nothing matches.
    """
    if s is None:
        return None
    if isinstance(s, int):
        # Some callers already hold a numeric limit; previously this
        # raised TypeError inside re.match
        return s if 0 <= s <= 21 else None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
1618
1619
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback name, parentheses, trailing ';'
    and trailing '//' comments) from *code*."""
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
1623
1624
def js_to_json(code):
    """Convert JavaScript object notation into valid JSON text
    (single-quoted strings, bare keys, trailing commas)."""
    def _fix_token(m):
        tok = m.group(0)
        # JSON keywords pass through unchanged
        if tok in ('true', 'false', 'null'):
            return tok
        # Already a double-quoted string
        if tok.startswith('"'):
            return tok
        if tok.startswith("'"):
            # Re-quote a single-quoted string, fixing its escapes
            inner = tok[1:-1]
            inner = re.sub(r"\\\\|\\'|\"", lambda esc: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[esc.group(0)], inner)
            return '"%s"' % inner
        # Bare identifier (e.g. an unquoted key)
        return '"%s"' % tok

    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', _fix_token, code)
    # Drop trailing commas before closing brackets/braces
    return re.sub(r',(\s*[\]}])', lambda m: m.group(1), converted)
1648
1649
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality; unknown ids rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1658
1659
# Default output filename template (the --output option)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1661
1662
1663 def limit_length(s, length):
1664 """ Add ellipses to overly long strings """
1665 if s is None:
1666 return None
1667 ELLIPSES = '...'
1668 if len(s) > length:
1669 return s[:length - len(ELLIPSES)] + ELLIPSES
1670 return s
1671
1672
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    return tuple(int(part) for part in re.split(r'[-.]', v))
1675
1676
def is_outdated_version(version, limit, assume_new=True):
    """Return True when *version* is older than *limit*; when either string
    is missing or unparseable, fall back to (not assume_new)."""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
1684
1685
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updatable when running from a zipball or a frozen executable
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
1691
1692
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(shlex_quote(arg) for arg in args)
1696
1697
def mimetype2ext(mt):
    """Map a MIME type to a file extension; unknown subtypes fall through
    unchanged (e.g. 'video/mp4' -> 'mp4')."""
    _, _, subtype = mt.rpartition('/')

    special_cases = {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
        'ttml+xml': 'ttml',
    }
    return special_cases.get(subtype, subtype)
1706
1707
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a urllib response, preferring the
    Content-Disposition filename over the Content-Type."""
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    content_disposition = getheader('Content-Disposition')
    if content_disposition:
        m = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"',
            content_disposition)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
1724
1725
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:
        # No limit set
        return False
    if content_limit is None:
        # Content available for everyone
        return False
    return age_limit < content_limit
1734
1735
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    decoded = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if decoded is None:
        # No BOM: assume UTF-8
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
1754
1755
def determine_protocol(info_dict):
    """Work out the download protocol: explicit 'protocol' field, known URL
    prefix, known extension, or the URL scheme as a fallback."""
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
1776
1777
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Pad every column (except the last) to its widest cell plus one space
    widths = [max(len(compat_str(cell)) for cell in col) for col in zip(*rows)]
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
1784
1785
def _match_one(filter_part, dct):
    """Evaluate one filter expression (e.g. 'duration > 60', 'title=abc',
    '!is_live') against the dict *dct*; helper for match_str()."""
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # key OP value, where value is an integer (optionally with a filesize
    # suffix like '500k') or a plain alphanumeric string; a trailing '?'
    # after the operator makes missing keys pass the filter
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # String values only make sense for equality comparisons
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain int: try as a filesize ('500KiB'), then with
                # an implied trailing 'B' ('500K')
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # Missing key: only passes when the '?' suffix was given
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    # Unary form: bare 'key' (must be present) or '!key' (must be absent)
    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
1843
1844
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&' joins sub-filters; all of them must pass
    return all(
        _match_one(part, dct) for part in filter_str.split('&'))
1850
1851
def match_filter_func(filter_str):
    """Build a --match-filter callback: returns None to keep a video and a
    skip message string otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
1860
1861
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.5s' or 'HH:MM:SS.m') into
    seconds; empty input yields 0.0, unrecognized input None (implicitly)."""
    if not time_expr:
        return 0.0

    m = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if m:
        return float(m.group('time_offset'))

    m = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if m:
        hours, mins, secs = m.groups()
        return 3600 * int(hours) + 60 * int(mins) + float(secs)
1873
1874
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode: HH:MM:SS,mmm."""
    whole = int(seconds)
    hours, rem = divmod(whole, 3600)
    mins, secs = divmod(rem, 60)
    millis = int((seconds % 1) * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millis)
1877
1878
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document into SRT text.

    Raises ValueError when the document contains no <p> cues.
    """
    # Helper resolving tag names in both the TTML and older TTAF1 namespaces
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

    def parse_node(node):
        # Flatten a cue node (including <br> and nested <span>) into text
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

        for child in node:
            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                out += '\n' + str_or_empty(child.tail)
            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
                out += str_or_empty(parse_node(child))
            else:
                # Unknown markup: keep its serialized form verbatim
                out += str_or_empty(xml.etree.ElementTree.tostring(child))

        return out

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    out = []
    # Try each namespace variant, then un-namespaced <p>
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        if not end_time:
            # No explicit end attribute: derive it from begin + dur
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
1919
1920
class ISO639Utils(object):
    """Conversions between 2-letter (ISO 639-1) and 3-letter (ISO 639-2/T)
    language codes, backed by a static lookup table."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are significant (tolerates
        # region-qualified codes like 'en-US'); None when unknown
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Linear reverse lookup; returns None (implicitly) when unknown
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
2121
2122
2123 class ISO3166Utils(object):
2124 # From http://data.okfn.org/data/core/country-list
2125 _country_map = {
2126 'AF': 'Afghanistan',
2127 'AX': 'Åland Islands',
2128 'AL': 'Albania',
2129 'DZ': 'Algeria',
2130 'AS': 'American Samoa',
2131 'AD': 'Andorra',
2132 'AO': 'Angola',
2133 'AI': 'Anguilla',
2134 'AQ': 'Antarctica',
2135 'AG': 'Antigua and Barbuda',
2136 'AR': 'Argentina',
2137 'AM': 'Armenia',
2138 'AW': 'Aruba',
2139 'AU': 'Australia',
2140 'AT': 'Austria',
2141 'AZ': 'Azerbaijan',
2142 'BS': 'Bahamas',
2143 'BH': 'Bahrain',
2144 'BD': 'Bangladesh',
2145 'BB': 'Barbados',
2146 'BY': 'Belarus',
2147 'BE': 'Belgium',
2148 'BZ': 'Belize',
2149 'BJ': 'Benin',
2150 'BM': 'Bermuda',
2151 'BT': 'Bhutan',
2152 'BO': 'Bolivia, Plurinational State of',
2153 'BQ': 'Bonaire, Sint Eustatius and Saba',
2154 'BA': 'Bosnia and Herzegovina',
2155 'BW': 'Botswana',
2156 'BV': 'Bouvet Island',
2157 'BR': 'Brazil',
2158 'IO': 'British Indian Ocean Territory',
2159 'BN': 'Brunei Darussalam',
2160 'BG': 'Bulgaria',
2161 'BF': 'Burkina Faso',
2162 'BI': 'Burundi',
2163 'KH': 'Cambodia',
2164 'CM': 'Cameroon',
2165 'CA': 'Canada',
2166 'CV': 'Cape Verde',
2167 'KY': 'Cayman Islands',
2168 'CF': 'Central African Republic',
2169 'TD': 'Chad',
2170 'CL': 'Chile',
2171 'CN': 'China',
2172 'CX': 'Christmas Island',
2173 'CC': 'Cocos (Keeling) Islands',
2174 'CO': 'Colombia',
2175 'KM': 'Comoros',
2176 'CG': 'Congo',
2177 'CD': 'Congo, the Democratic Republic of the',
2178 'CK': 'Cook Islands',
2179 'CR': 'Costa Rica',
2180 'CI': 'Côte d\'Ivoire',
2181 'HR': 'Croatia',
2182 'CU': 'Cuba',
2183 'CW': 'Curaçao',
2184 'CY': 'Cyprus',
2185 'CZ': 'Czech Republic',
2186 'DK': 'Denmark',
2187 'DJ': 'Djibouti',
2188 'DM': 'Dominica',
2189 'DO': 'Dominican Republic',
2190 'EC': 'Ecuador',
2191 'EG': 'Egypt',
2192 'SV': 'El Salvador',
2193 'GQ': 'Equatorial Guinea',
2194 'ER': 'Eritrea',
2195 'EE': 'Estonia',
2196 'ET': 'Ethiopia',
2197 'FK': 'Falkland Islands (Malvinas)',
2198 'FO': 'Faroe Islands',
2199 'FJ': 'Fiji',
2200 'FI': 'Finland',
2201 'FR': 'France',
2202 'GF': 'French Guiana',
2203 'PF': 'French Polynesia',
2204 'TF': 'French Southern Territories',
2205 'GA': 'Gabon',
2206 'GM': 'Gambia',
2207 'GE': 'Georgia',
2208 'DE': 'Germany',
2209 'GH': 'Ghana',
2210 'GI': 'Gibraltar',
2211 'GR': 'Greece',
2212 'GL': 'Greenland',
2213 'GD': 'Grenada',
2214 'GP': 'Guadeloupe',
2215 'GU': 'Guam',
2216 'GT': 'Guatemala',
2217 'GG': 'Guernsey',
2218 'GN': 'Guinea',
2219 'GW': 'Guinea-Bissau',
2220 'GY': 'Guyana',
2221 'HT': 'Haiti',
2222 'HM': 'Heard Island and McDonald Islands',
2223 'VA': 'Holy See (Vatican City State)',
2224 'HN': 'Honduras',
2225 'HK': 'Hong Kong',
2226 'HU': 'Hungary',
2227 'IS': 'Iceland',
2228 'IN': 'India',
2229 'ID': 'Indonesia',
2230 'IR': 'Iran, Islamic Republic of',
2231 'IQ': 'Iraq',
2232 'IE': 'Ireland',
2233 'IM': 'Isle of Man',
2234 'IL': 'Israel',
2235 'IT': 'Italy',
2236 'JM': 'Jamaica',
2237 'JP': 'Japan',
2238 'JE': 'Jersey',
2239 'JO': 'Jordan',
2240 'KZ': 'Kazakhstan',
2241 'KE': 'Kenya',
2242 'KI': 'Kiribati',
2243 'KP': 'Korea, Democratic People\'s Republic of',
2244 'KR': 'Korea, Republic of',
2245 'KW': 'Kuwait',
2246 'KG': 'Kyrgyzstan',
2247 'LA': 'Lao People\'s Democratic Republic',
2248 'LV': 'Latvia',
2249 'LB': 'Lebanon',
2250 'LS': 'Lesotho',
2251 'LR': 'Liberia',
2252 'LY': 'Libya',
2253 'LI': 'Liechtenstein',
2254 'LT': 'Lithuania',
2255 'LU': 'Luxembourg',
2256 'MO': 'Macao',
2257 'MK': 'Macedonia, the Former Yugoslav Republic of',
2258 'MG': 'Madagascar',
2259 'MW': 'Malawi',
2260 'MY': 'Malaysia',
2261 'MV': 'Maldives',
2262 'ML': 'Mali',
2263 'MT': 'Malta',
2264 'MH': 'Marshall Islands',
2265 'MQ': 'Martinique',
2266 'MR': 'Mauritania',
2267 'MU': 'Mauritius',
2268 'YT': 'Mayotte',
2269 'MX': 'Mexico',
2270 'FM': 'Micronesia, Federated States of',
2271 'MD': 'Moldova, Republic of',
2272 'MC': 'Monaco',
2273 'MN': 'Mongolia',
2274 'ME': 'Montenegro',
2275 'MS': 'Montserrat',
2276 'MA': 'Morocco',
2277 'MZ': 'Mozambique',
2278 'MM': 'Myanmar',
2279 'NA': 'Namibia',
2280 'NR': 'Nauru',
2281 'NP': 'Nepal',
2282 'NL': 'Netherlands',
2283 'NC': 'New Caledonia',
2284 'NZ': 'New Zealand',
2285 'NI': 'Nicaragua',
2286 'NE': 'Niger',
2287 'NG': 'Nigeria',
2288 'NU': 'Niue',
2289 'NF': 'Norfolk Island',
2290 'MP': 'Northern Mariana Islands',
2291 'NO': 'Norway',
2292 'OM': 'Oman',
2293 'PK': 'Pakistan',
2294 'PW': 'Palau',
2295 'PS': 'Palestine, State of',
2296 'PA': 'Panama',
2297 'PG': 'Papua New Guinea',
2298 'PY': 'Paraguay',
2299 'PE': 'Peru',
2300 'PH': 'Philippines',
2301 'PN': 'Pitcairn',
2302 'PL': 'Poland',
2303 'PT': 'Portugal',
2304 'PR': 'Puerto Rico',
2305 'QA': 'Qatar',
2306 'RE': 'Réunion',
2307 'RO': 'Romania',
2308 'RU': 'Russian Federation',
2309 'RW': 'Rwanda',
2310 'BL': 'Saint Barthélemy',
2311 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2312 'KN': 'Saint Kitts and Nevis',
2313 'LC': 'Saint Lucia',
2314 'MF': 'Saint Martin (French part)',
2315 'PM': 'Saint Pierre and Miquelon',
2316 'VC': 'Saint Vincent and the Grenadines',
2317 'WS': 'Samoa',
2318 'SM': 'San Marino',
2319 'ST': 'Sao Tome and Principe',
2320 'SA': 'Saudi Arabia',
2321 'SN': 'Senegal',
2322 'RS': 'Serbia',
2323 'SC': 'Seychelles',
2324 'SL': 'Sierra Leone',
2325 'SG': 'Singapore',
2326 'SX': 'Sint Maarten (Dutch part)',
2327 'SK': 'Slovakia',
2328 'SI': 'Slovenia',
2329 'SB': 'Solomon Islands',
2330 'SO': 'Somalia',
2331 'ZA': 'South Africa',
2332 'GS': 'South Georgia and the South Sandwich Islands',
2333 'SS': 'South Sudan',
2334 'ES': 'Spain',
2335 'LK': 'Sri Lanka',
2336 'SD': 'Sudan',
2337 'SR': 'Suriname',
2338 'SJ': 'Svalbard and Jan Mayen',
2339 'SZ': 'Swaziland',
2340 'SE': 'Sweden',
2341 'CH': 'Switzerland',
2342 'SY': 'Syrian Arab Republic',
2343 'TW': 'Taiwan, Province of China',
2344 'TJ': 'Tajikistan',
2345 'TZ': 'Tanzania, United Republic of',
2346 'TH': 'Thailand',
2347 'TL': 'Timor-Leste',
2348 'TG': 'Togo',
2349 'TK': 'Tokelau',
2350 'TO': 'Tonga',
2351 'TT': 'Trinidad and Tobago',
2352 'TN': 'Tunisia',
2353 'TR': 'Turkey',
2354 'TM': 'Turkmenistan',
2355 'TC': 'Turks and Caicos Islands',
2356 'TV': 'Tuvalu',
2357 'UG': 'Uganda',
2358 'UA': 'Ukraine',
2359 'AE': 'United Arab Emirates',
2360 'GB': 'United Kingdom',
2361 'US': 'United States',
2362 'UM': 'United States Minor Outlying Islands',
2363 'UY': 'Uruguay',
2364 'UZ': 'Uzbekistan',
2365 'VU': 'Vanuatu',
2366 'VE': 'Venezuela, Bolivarian Republic of',
2367 'VN': 'Viet Nam',
2368 'VG': 'Virgin Islands, British',
2369 'VI': 'Virgin Islands, U.S.',
2370 'WF': 'Wallis and Futuna',
2371 'EH': 'Western Sahara',
2372 'YE': 'Yemen',
2373 'ZM': 'Zambia',
2374 'ZW': 'Zimbabwe',
2375 }
2376
2377 @classmethod
2378 def short2full(cls, code):
2379 """Convert an ISO 3166-2 country code to the corresponding full name"""
2380 return cls._country_map.get(code.upper())
2381
2382
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler variant that honours a per-request proxy override.

    A request may carry a ``Ytdl-request-proxy`` header naming the proxy to
    use for that request only; the special value ``'__noproxy__'`` disables
    proxying entirely for the request.
    """

    def __init__(self, proxies=None):
        # Install default http/https openers before the parent constructor
        # runs.  Each scheme's value is captured via a default argument so
        # the lambdas do not all see the last loop value (late binding).
        for scheme in ('http', 'https'):
            setattr(
                self, '%s_open' % scheme,
                lambda r, proxy='__noproxy__', scheme=scheme, meth=self.proxy_open:
                    meth(r, proxy, scheme))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A Ytdl-request-proxy header overrides whatever proxy was chosen;
        # the header is stripped so it is never sent over the wire.
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # fall through to a direct (unproxied) connection
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)