# youtube_dl/utils.py
1 #!/usr/bin/env python
2 # coding: utf-8
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import errno
15 import functools
16 import gzip
17 import io
18 import itertools
19 import json
20 import locale
21 import math
22 import operator
23 import os
24 import pipes
25 import platform
26 import re
27 import socket
28 import ssl
29 import subprocess
30 import sys
31 import tempfile
32 import traceback
33 import xml.etree.ElementTree
34 import zlib
35
36 from .compat import (
37 compat_HTMLParser,
38 compat_basestring,
39 compat_chr,
40 compat_etree_fromstring,
41 compat_html_entities,
42 compat_html_entities_html5,
43 compat_http_client,
44 compat_kwargs,
45 compat_os_name,
46 compat_parse_qs,
47 compat_shlex_quote,
48 compat_socket_create_connection,
49 compat_str,
50 compat_struct_pack,
51 compat_struct_unpack,
52 compat_urllib_error,
53 compat_urllib_parse,
54 compat_urllib_parse_urlencode,
55 compat_urllib_parse_urlparse,
56 compat_urllib_parse_unquote_plus,
57 compat_urllib_request,
58 compat_urlparse,
59 compat_xpath,
60 )
61
62 from .socks import (
63 ProxyType,
64 sockssocket,
65 )
66
67
def register_socks_protocols():
    """Teach urlparse that SOCKS proxy schemes carry a netloc.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose scheme is not listed in
    urlparse.uses_netloc are not handled correctly, so the SOCKS schemes
    are registered here before any proxy URL gets parsed.
    """
    for socks_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if socks_scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(socks_scheme)
75
76
# This is not clearly defined otherwise
# (there is no portable public "compiled pattern" type, so capture it here
# for later isinstance checks)
compiled_regex_type = type(re.compile(''))

# Default headers merged into every outgoing HTTP request
# (see YoutubeDLHandler.http_request below)
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Alternative User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Sentinel distinguishing "no default supplied" from an explicit None default
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language code, used when parsing localized date strings
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# Media file extensions recognized by the extractors
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# (maps each accented character to a plain-ASCII replacement; the two
# iterables below are kept positionally aligned)
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
126
# strptime() format strings tried in order when parsing date strings
# (presumably consumed by the date-parsing helpers elsewhere in this
# module — the consumers are outside this chunk)
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%b %d %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
)

# Additional formats for locales writing the day before the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# Additional formats for locales writing the month before the day (US style)
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the trailer of P.A.C.K.E.R.-style packed JavaScript:
# payload, radix, count and the '|'-separated symbol table
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
174
175
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Verify the reported encoding is actually usable before trusting it
        'TEST'.encode(encoding)
    except Exception:
        encoding = 'UTF-8'
    return encoding
189
190
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()

        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object; the same holds for os.path.dirname
        def path_basename(f):
            return os.path.basename(fn).decode(encoding)

        def path_dirname(f):
            return os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    tmp_args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # json.dump expects a bytestream on Python 2.x but a character stream
    # on Python 3.x, so open the temporary file accordingly
    if sys.version_info < (3, 0):
        tmp_args['mode'] = 'wb'
    else:
        tmp_args['mode'] = 'w'
        tmp_args['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(tmp_args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best effort: drop the temporary file, then re-raise
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
243
244
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # key is interpolated into an XPath expression below, so restrict it
        # to safe attribute-name characters
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    # Python 2.6's ElementTree does not support attribute predicates in
    # find(); emulate them by scanning all matches manually
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
259
260 # On python2.6 the xml.etree.ElementTree.Element methods don't support
261 # the namespace parameter
262
263
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into '{uri}tag' form using ns_map."""
    expanded = []
    for component in path.split('/'):
        pieces = component.split(':')
        if len(pieces) == 1:
            # No namespace prefix on this step
            expanded.append(pieces[0])
        else:
            prefix, tag = pieces
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
274
275
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath (a string or an iterable of
    candidates tried in order). Returns `default` if given and nothing
    matched, raises ExtractorError when fatal, otherwise returns None."""
    def _find_xpath(xp):
        return node.find(compat_xpath(xp))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # Try each candidate expression until one matches
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is not None:
        return n
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element %s' % name)
    return None
297
298
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text content."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        # No element found; propagate whatever xpath_element decided
        return n
    if n.text is not None:
        return n.text
    # Element exists but carries no text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
312
313
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return the value of attribute `key` on the element matching xpath."""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
325
326
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper: an element's id is just another attribute
    return get_element_by_attribute('id', id, html)
330
331
def get_element_by_class(class_name, html):
    """Return the content of the first tag carrying the given CSS class."""
    # Match the class name \b-delimited anywhere inside a (possibly
    # multi-valued) class attribute; escaping is done here, so tell
    # get_element_by_attribute not to escape again
    class_value = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_element_by_attribute('class', class_value, html, escape_value=False)
336
337
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    if escape_value:
        value = re.escape(value)

    # Match an opening tag containing attribute=value (with or without
    # quotes), capture everything up to the corresponding closing tag
    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html)

    if not m:
        return None
    content = m.group('content')

    # Strip a stray pair of surrounding quotes, if any
    if content.startswith('"') or content.startswith("'"):
        content = content[1:-1]

    return unescapeHTML(content)
361
362
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        # Attribute name -> value of the last start tag seen; the expected
        # input contains exactly one element
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
371
372
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
     a="foo" B="bar" c="&98;az" d=boz
     empty= noval entity="&amp;"
     sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    attr_parser.feed(html_element)
    attr_parser.close()
    return attr_parser.attrs
393
394
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    # Convenience for sanitizing descriptions etc.
    if html is None:
        return html

    # Turn <br> and paragraph boundaries into newlines, dropping the
    # original newlines first so only markup-driven breaks remain
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
410
411
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            # '-' means standard output; on Windows stdout must be switched
            # to binary mode first so media bytes are not mangled
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            # Python 3 exposes the byte stream as sys.stdout.buffer
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors cannot be fixed by renaming — re-raise as-is
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            # Sanitizing changed nothing, so retrying would fail identically
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
442
443
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a parseable RFC 2822 date
        return None
    return email.utils.mktime_tz(parsed)
451
452
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Order matters: accent transliteration must run before the
        # generic non-ASCII check below drops the character entirely
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        # '?', control characters and DEL are never allowed
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            # Characters forbidden in Windows filenames
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            # Restricted mode keeps ASCII only
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        # Collapse runs of '_' introduced above, then trim them at the ends
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
491
492
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # Only Windows paths need this treatment
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # splitdrive() does not handle UNC paths before 2.7
        drive_or_unc, _ = os.path.splitunc(s)
    # Normalize the remainder and examine each component on its own
    parts = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        parts.pop(0)
    sanitized = []
    for part in parts:
        if part in ('.', '..'):
            sanitized.append(part)
        else:
            # Replace characters forbidden on Windows plus trailing dot/space
            sanitized.append(re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', part))
    if drive_or_unc:
        sanitized.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized)
509
510
def sanitize_url(url):
    """Prepend the `http:` scheme to protocol-relative URLs.

    Mitigates unwanted failures caused by URLs that start with '//'
    (missing protocol); all other URLs pass through untouched.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
515
516
def sanitized_Request(url, *args, **kwargs):
    """Build a compat Request whose URL has been passed through sanitize_url()."""
    sanitized = sanitize_url(url)
    return compat_urllib_request.Request(sanitized, *args, **kwargs)
519
520
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Membership is tested against the output list itself (not a set) so
    # that unhashable elements keep working; input order is preserved.
    deduped = []
    for item in iterable:
        if item in deduped:
            continue
        deduped.append(item)
    return deduped
528
529
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    # Strip the trailing ';' for the legacy lookup tables
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric reference: decimal '#160' or hexadecimal '#x202e'
    match = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if match is not None:
        numstr = match.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
559
560
def unescapeHTML(s):
    """Replace all HTML entities in s with their decoded characters.

    The entity pattern excludes '&' from the entity body so that a stray,
    unterminated ampersand directly before a real entity (e.g. '&a&quot;')
    cannot swallow the following entity; the previous pattern
    r'&([^;]+;)' matched across the second '&' and broke such strings.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
568
569
def get_subprocess_encoding():
    """Return the text encoding to use for subprocess arguments and output."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    encoding = sys.getfilesystemencoding()
    # getfilesystemencoding() may report None on some interpreters
    return 'utf-8' if encoding is None else encoding
580
581
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
604
605
def decodeFilename(b, for_subprocess=False):
    """Decode a byte filename to text on Python 2; pass anything else through."""
    # Python 3 filenames are already text
    if sys.version_info >= (3, 0):
        return b
    if not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
615
616
def encodeArgument(s):
    """Encode a subprocess argument, coercing legacy byte strings to text first."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
624
625
def decodeArgument(b):
    """Decode a subprocess argument (counterpart of encodeArgument)."""
    return decodeFilename(b, True)
628
629
def decodeOption(optval):
    """Decode a command-line option value to text using the locale encoding."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())
    # By now the value must be text
    assert isinstance(optval, compat_str)
    return optval
638
639
def formatSeconds(secs):
    """Format a duration in seconds as 'S', 'M:SS' or 'H:MM:SS' text.

    Uses >= at the unit boundaries so that exactly 60 s renders as
    '1:00' and exactly 3600 s as '1:00:00'; the previous strict '>'
    produced the inconsistent '60' and '60:00' at those boundaries.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
647
648
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honouring params['nocheckcertificate'].

    Picks the best SSL configuration the running interpreter supports,
    falling back step by step for older Pythons.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # Disable both hostname and certificate verification on request
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # No SSLContext support at all; rely on handler defaults
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        # Build a TLSv1 context manually with system CA paths
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
672
673
def bug_reports_message():
    """Return the standard "please report this issue" suffix appended to
    unexpected error messages."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return (
        '; please report this issue on https://yt-dl.org/bug .'
        ' Make sure you are using the latest version; %s.'
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        % update_cmd)
683
684
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-level failures are always treated as expected (i.e. not
        # youtube-dl bugs), regardless of what the caller passed
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get the "please report this issue" boilerplate
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        # Formatted original traceback, or None if none was captured
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
712
713
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""

    def __init__(self, url):
        # Unsupported URLs are expected failures, not bugs
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url
719
720
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
724
725
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept so callers can inspect/re-raise the originating error
        self.exc_info = exc_info
738
739
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
747
748
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Pass msg to Exception so str(exc) and exc.args are populated;
        # previously only self.msg was set and str(exc) was empty.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
758
759
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
763
764
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
772
773
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Include the byte counts in the message so str(exc) is informative;
        # previously Exception was initialized without any message.
        super(ContentTooShortError, self).__init__(
            'Downloaded %d bytes, expected %d bytes' % (downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
786
787
class XAttrMetadataError(Exception):
    """Raised when writing an extended file attribute fails.

    `reason` classifies the failure for callers:
      * 'NO_SPACE'       - disk full or quota exceeded
      * 'VALUE_TOO_LONG' - attribute value exceeds the OS limit
      * 'NOT_SUPPORTED'  - anything else
    """

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        # Note: the message check previously looked for the misspelled
        # 'Disk quota excedded', which the OS never emits; the real
        # strerror for EDQUOT is 'Disk quota exceeded'.
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
802
803
class XAttrUnavailableError(Exception):
    # Raised when no mechanism for writing extended attributes is available
    pass
806
807
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Factory used via functools.partial from the HTTP/HTTPS handlers below;
    # builds a connection honouring the 'source_address' param.
    #
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # source_address attribute is missing; monkey-patch connect()
            # to create the socket bound to the requested local address
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
833
834
def handle_youtubedl_headers(headers):
    """Strip internal youtube-dl control headers before the real request.

    If 'Youtubedl-no-compression' is present, drop it together with any
    Accept-Encoding header (matched case-insensitively); otherwise the
    headers are returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict(
        (key, value) for key, value in headers.items()
        if key.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
843
844
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Swap in a SOCKS-capable connection class when the internal
        # Ytdl-socks-proxy control header is set
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        # Raw deflate first; fall back to zlib-wrapped deflate,
        # since servers are inconsistent about which one they send
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older addinfourl has no code/getcode support; set it manually there
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with progressively shorter payloads until one decodes
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS requests/responses get the same treatment
    https_request = http_request
    https_response = http_response
974
975
def make_socks_conn_class(base_class, socks_proxy):
    # Returns a subclass of base_class whose connect() tunnels through the
    # SOCKS proxy described by the socks_proxy URL.
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    # NOTE(review): any other scheme leaves socks_type unbound and raises
    # NameError below — presumably callers only pass the schemes registered
    # in register_socks_protocols(); confirm before relying on this.

    def unquote_if_non_empty(s):
        # username/password may be absent (None/'') — leave those untouched
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            # Open the raw SOCKS tunnel first...
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # ...then wrap it in TLS when the base class is HTTPS
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1017
1018
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler honouring youtube-dl params (SSL context, SOCKS proxy)."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        conn_kwargs = {}
        conn_class = self._https_conn_class

        # Forward SSL settings stored by the base handler, when present
        if hasattr(self, '_context'):  # python > 2.6
            conn_kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            conn_kwargs['check_hostname'] = self._check_hostname

        # Route through SOCKS when the internal control header is set
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **conn_kwargs)
1042
1043
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also applies cookie handling to HTTPS traffic."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Reuse the HTTP logic for HTTPS requests/responses
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1066
1067
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (utc_offset, remaining_date_str); the offset is a zero timedelta
    when no (or a 'Z') designator is found.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if m is None:
        return datetime.timedelta(), date_str
    # Trim the designator off the date string
    date_str = date_str[:-len(m.group('tz'))]
    sign_char = m.group('sign')
    if not sign_char:  # matched a bare 'Z' (UTC)
        return datetime.timedelta(), date_str
    factor = 1 if sign_char == '+' else -1
    offset = datetime.timedelta(
        hours=factor * int(m.group('hours')),
        minutes=factor * int(m.group('minutes')))
    return offset, date_str
1084
1085
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """Return the UNIX timestamp for an ISO 8601 date string (None on failure)."""
    if date_str is None:
        return None

    # strptime's %S cannot digest fractional seconds; strip them first
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    fmt = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        dt = datetime.datetime.strptime(date_str, fmt) - timezone
    except ValueError:
        return None
    return calendar.timegm(dt.timetuple())
1103
1104
def date_formats(day_first=True):
    """Return the tuple of strptime formats to try, preferring day-first
    (e.g. European) or month-first (e.g. US) interpretations."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1107
1108
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    # NOTE: deliberately no break — a later matching format overwrites the
    # result of an earlier one
    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Fall back to RFC 2822 style dates ('Sat, 06 Dec 2014 ...')
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
1135
1136
def unified_timestamp(date_str, day_first=True):
    """Parse a free-form date string into a UNIX timestamp (or None)."""
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    # 12-hour clock: remember to add 12 hours when 'PM' is present
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    # Fall back to RFC 2822 style dates
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
1158
1159
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL, falling back to default_ext."""
    if url is None:
        return default_ext
    # Everything after the last '.' of the path part (query string dropped)
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
1171
1172
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle filename: media extension replaced by
    '<language>.<subtitle format>'."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
1175
1176
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Plain absolute date
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # Months and years have no fixed length; approximate with 30/365 days
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    return today + datetime.timedelta(**{unit + 's': amount})
1204
1205
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    m = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    # Anything not matching the 8-digit shape is passed through unchanged
    return '-'.join(m.groups()) if m else date_str
1214
1215
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Unbounded ends default to the extreme representable dates
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1245
1246
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        # Python 2 may return bytes; decode with the locale's preferred encoding
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res
1255
1256
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> GetStdHandle id (STD_OUTPUT_HANDLE / STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # Only a local character device on which GetConsoleMode succeeds is a
        # real console; anything else must go through the normal write path
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 BMP characters at a time; non-BMP characters are
        # written one at a time as a surrogate pair (2 UTF-16 code units)
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1330
1331
def write_string(s, out=None, encoding=None):
    """Write the unicode string s to out (default sys.stderr), handling
    Windows consoles and byte-oriented streams, and flush."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        # Try the WriteConsoleW fast path first; fall through if it declines
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode ourselves so we
        # control the encoding and error handling
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
1352
1353
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values
    (Python 2/3 compatible)."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: indexing bytes yields ints
        return list(bs)
    # Python 2: indexing a str yields 1-char strings
    return [ord(c) for c in bs]
1361
1362
def intlist_to_bytes(xs):
    """Convert a list of integer byte values (0-255) back into a byte string."""
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
1367
1368
# Cross-platform file locking: defines _lock_file(f, exclusive) and
# _unlock_file(f) using LockFileEx/UnlockFileEx on Windows, flock elsewhere,
# and stubs that raise IOError where neither is available.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure passed to LockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range covering (almost) the entire file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED alive on the file object; it must outlive the lock
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1442
1443
class locked_file(object):
    """Context manager wrapping an open file in an advisory lock:
    exclusive for 'w'/'a' modes, shared for 'r'."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Writers need exclusivity; readers may share the lock
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            # Locking failed: don't leak the file handle
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1473
1474
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to UTF-8 when unknown."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1478
1479
def shell_quote(args):
    """Return the arguments joined as a single shell-quoted string."""
    fs_encoding = get_filesystem_encoding()

    def _as_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(fs_encoding) if isinstance(a, bytes) else a

    return ' '.join(pipes.quote(_as_text(a)) for a in args)
1489
1490
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any data already smuggled into the URL
    url, existing_data = unsmuggle_url(url, {})
    data.update(existing_data)
    fragment = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + fragment
1499
1500
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(): return (clean_url, smuggled_data), with
    *default* as the data when nothing was smuggled."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    payload = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1508
1509
def format_bytes(bytes):
    """Format a byte count as a human-readable string ('1.00KiB')."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # log base 1024 picks the binary-prefix bucket; 0 needs special-casing
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = ('B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB')[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1522
1523
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number><unit>' expression in s using unit_table
    (unit -> multiplier); return the integer total, or None if no match."""
    alternation = '|'.join(re.escape(unit) for unit in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternation, s)
    if m is None:
        return None
    # Accept ',' as a decimal separator (e.g. '1,5 MiB')
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
1533
1534
def parse_filesize(s):
    """Parse a human-readable file size ('2.5 MiB', '1000kB', ...) into an
    integer number of bytes, or None."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1604
1605
def parse_count(s):
    """Parse a human-style count like '1.2M' or '12,345' into an int
    (None when s is None or unparsable)."""
    if s is None:
        return None

    s = s.strip()

    # Plain number, possibly with digit grouping
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    suffixes = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }

    return lookup_unit_table(suffixes, s)
1625
1626
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    try:
        return names.index(name) + 1
    except ValueError:
        # Unknown month name
        return None
1636
1637
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [name[:3] for name in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
1646
1647
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # A '&' that already starts an entity or character reference is left alone
    pattern = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(pattern, '&amp;', xml_str)
1654
1655
def setproctitle(title):
    """Best-effort: set the process name (as shown by ps/top) via libc prctl;
    silently does nothing where unsupported."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1675
1676
def remove_start(s, start):
    """Return s with the prefix start removed (s unchanged if absent or None)."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1679
1680
def remove_end(s, end):
    """Return s with the suffix end removed (s unchanged if absent or None)."""
    if s is None or not s.endswith(end):
        return s
    # Use len(s) - len(end) rather than a negative slice: s[:-0] would
    # wrongly return '' when end is the empty string
    return s[:len(s) - len(end)]
1683
1684
def remove_quotes(s):
    """Strip one matching pair of single or double quotes around s, if any."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1692
1693
def url_basename(url):
    """Return the final path segment of url ('' when the path is empty)."""
    path = compat_urlparse.urlparse(url).path
    segments = path.strip('/').split('/')
    return segments[-1]
1697
1698
def base_url(url):
    """Return the URL up to and including the last '/' before any query or
    fragment, e.g. 'http://a/b/c?x' -> 'http://a/b/'."""
    m = re.match(r'https?://[^?#&]+/', url)
    # NOTE(review): raises AttributeError for URLs without a path slash —
    # presumably callers only pass well-formed http(s) URLs
    return m.group(0)
1701
1702
def urljoin(base, path):
    """Join base and path into an absolute URL; return None when either part
    is unusable (non-string, empty, or base not an http(s)/protocol-relative
    URL)."""
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(r'^(?:https?:)?//', path):
        # path is already absolute (possibly protocol-relative)
        return path
    if not isinstance(base, compat_str) or not re.match(r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
1711
1712
class HEADRequest(compat_urllib_request.Request):
    """Request subclass that issues HEAD instead of GET."""

    def get_method(self):
        return 'HEAD'
1716
1717
class PUTRequest(compat_urllib_request.Request):
    """Request subclass that issues PUT instead of GET."""

    def get_method(self):
        return 'PUT'
1721
1722
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default when conversion is impossible.

    When get_attr is set, v is first replaced by getattr(v, get_attr, None).
    The result is multiplied by invscale and floor-divided by scale.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # TypeError covers non-numeric, non-string values (lists, dicts, ...)
        # which should also fall back to the default instead of crashing
        return default
1735
1736
def str_or_none(v, default=None):
    """Return v converted to compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1739
1740
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Tolerate digit grouping and stray '+' signs: '123,456' -> 123456
    cleaned = re.sub(r'[,\.\+]', '', int_str)
    return int(cleaned)
1747
1748
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, returning default when conversion is impossible.

    The result is multiplied by invscale and divided by scale.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError covers non-numeric, non-string values (lists, dicts, ...)
        # which should also fall back to the default instead of crashing
        return default
1756
1757
def strip_or_none(v):
    """Return v.strip(), passing None through unchanged."""
    if v is None:
        return None
    return v.strip()
1760
1761
def parse_duration(s):
    """Parse a duration string ('1:23:45', '3h 2min', 'PT1H2M3S', ...) into a
    number of seconds, or None when s is not a string or unparsable."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    # Colon style: [[[dd:]hh:]mm:]ss[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # Unit style ('3d 2h 1m 5.5s') and ISO 8601-like ('PT1H2M3S')
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # Spelled-out single units ('2.5 hours', '3 minutes')
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        # ms still carries its leading '.', so this adds the fraction
        duration += float(ms)
    return duration
1808
1809
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension: 'v.mp4' -> 'v.<ext>.mp4'.

    If expected_real_ext is given and does not match the actual extension,
    ext is appended instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1816
1817
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file extension with ext; when expected_real_ext is given
    and does not match the actual extension, ext is appended instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return '{0}.{1}'.format(name, ext)
1823
1824
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        proc = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        # Binary not found / not executable
        return False
    return exe
1833
1834
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1852
1853
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version number from an executable's --version output,
    returning *unrecognized* when no version can be found."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1863
1864
class PagedList(object):
    """Base class for paginated result lists; subclasses provide
    getslice(start, end)."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1869
1870
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages lazily via pagefunc(pagenum), each page
    containing pagesize items, optionally caching fetched pages."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        # Collect items in [start, end) while fetching as few pages as possible
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # Global index range covered by this page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offsets of the requested slice within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1921
1922
class InAdvancePagedList(PagedList):
    """PagedList where the total number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Drop items before `start` (first fetched page only)
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page contains the last requested item
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1950
1951
def uppercase_escape(s):
    """Expand literal '\\UXXXXXXXX' escape sequences in s into the
    corresponding characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
1958
1959
def lowercase_escape(s):
    """Expand literal '\\uXXXX' escape sequences in s into the corresponding
    characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
1966
1967
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # The safe list keeps URL delimiters intact so the URL structure survives
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1973
1974
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        # Non-ASCII host names are IDNA (punycode) encoded instead of quoted
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
1985
1986
def read_batch_urls(batch_fd):
    """Read URLs from an open batch file, skipping comment lines, and close it."""
    def _cleanup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # Lines starting with these characters are treated as comments
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        urls = []
        for line in fd:
            url = _cleanup(line)
            if url:
                urls.append(url)
        return urls
2001
2002
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes, as urllib request
    objects require."""
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2005
2006
def update_url_query(url, query):
    """Return url with the key/value pairs of query merged into its query
    string (existing keys are overwritten)."""
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
2015
2016
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Clone req, optionally overriding its URL, data, headers and query
    string, while preserving the HTTP method."""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    # Keep non-GET methods by choosing the matching Request subclass
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        # timeout is not part of the standard Request API — presumably set
        # elsewhere in this project; preserve it when present
        new_req.timeout = req.timeout
    return new_req
2035
2036
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up key_or_keys (a key, or a list/tuple of keys tried in order)
    in d, skipping missing/None (and, by default, falsy) values."""
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2045
2046
def try_get(src, getter, expected_type=None):
    """Apply getter to src, swallowing common access errors; return the value
    only if it matches expected_type (when given), else None."""
    try:
        value = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is None or isinstance(value, expected_type):
        return value
    return None
2055
2056
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as compat_str, decoding byte strings with the given
    encoding (note: the default encoding is evaluated once, at import time)."""
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2059
2060
# US movie ratings (MPAA-style) mapped to age limits
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# US TV content ratings mapped to age limits
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2078
2079
def parse_age_limit(s):
    """Normalize an age limit (int, '18+', US/TV rating string) to an int
    between 0 and 21, or None."""
    if type(s) == int:
        # Note: type() check deliberately excludes bools
        if 0 <= s <= 21:
            return s
        return None
    if not isinstance(s, compat_basestring):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m is not None:
        return int(m.group('age'))
    return US_RATINGS.get(s, TV_PARENTAL_GUIDELINES.get(s))
2091
2092
def strip_jsonp(code):
    """Strip a JSONP wrapper like 'callback({...});' down to the JSON payload;
    non-JSONP input is returned unchanged."""
    pattern = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(pattern, r'\1', code)
2096
2097
def js_to_json(code):
    """Convert a JavaScript object/value literal into valid JSON text."""
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            # Block comments and trailing commas are dropped
            return ""

        if v[0] in ("'", '"'):
            # Normalize string escapes to their JSON equivalents
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])

        # Hex and octal integers are rewritten as decimals (JSON has neither)
        INTEGER_TABLE = (
            (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
            (r'^(0+[0-7]+)\s*:?$', 8),
        )

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # An object key stays a (quoted) string; a value becomes a number
                return '"%d":' % i if v.endswith(':') else '%d' % i

        # Bare identifiers (including unquoted keys) become quoted strings
        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
2135
2136
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def lookup(qid):
        # Position in the list is the quality rank; unknown ids rank lowest.
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return lookup
2145
2146
2147 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2148
2149
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ellipses = '...'
    if len(s) <= length:
        return s
    # Truncate so that the result (including the ellipses) fits in *length*.
    return s[:length - len(ellipses)] + ellipses
2158
2159
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
2162
2163
def is_outdated_version(version, limit, assume_new=True):
    """Return True when *version* is older than *limit*.

    Missing or unparsable versions are judged by *assume_new*: when True
    they are treated as up to date, otherwise as outdated.
    """
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
    return outdated
2171
2172
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updateable when running from the zip bundle or a frozen executable.
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
2178
2179
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = (compat_shlex_quote(a) for a in args)
    return ' '.join(quoted)
2183
2184
def error_to_compat_str(err):
    """Stringify an exception as text, safe on both Python 2 and 3."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On Python 2 str(err) yields a byte string, which must be decoded with
    # the locale's preferred encoding rather than ASCII.
    return message.decode(preferredencoding())
2192
2193
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    Full-type overrides are consulted first; otherwise the subtype (with
    any ';'-parameters stripped, lowercased) is looked up, falling back to
    the subtype itself. Returns None for a None input.
    """
    if mt is None:
        return None

    FULL_TYPE_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    if mt in FULL_TYPE_MAP:
        return FULL_TYPE_MAP[mt]

    subtype = mt.rpartition('/')[2]
    subtype = subtype.split(';')[0].strip().lower()

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
    }
    return SUBTYPE_MAP.get(subtype, subtype)
2230
2231
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into {'vcodec': ..., 'acodec': ...}.

    Unrecognized codec strings trigger a warning; 'none' marks a stream
    that has no codec of that kind. Returns {} for empty input.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    codec_list = [
        c.strip() for c in codecs_str.strip().strip(',').split(',')
        if c.strip()]
    vcodec = None
    acodec = None
    for full_codec in codec_list:
        family = full_codec.split('.')[0]
        if family in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            # Keep only the first video codec encountered.
            vcodec = vcodec or full_codec
        elif family in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
            acodec = acodec or full_codec
        else:
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    # Nothing recognized: guess from the number of entries (kept as-is from
    # the original logic, including the None values it produces).
    if len(codec_list) == 2:
        return {
            'vcodec': vcodec,
            'acodec': acodec,
        }
    elif len(codec_list) == 1:
        return {
            'vcodec': 'none',
            'acodec': vcodec,
        }
    return {}
2266
2267
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a URL response.

    Tries the Content-Disposition filename first, then falls back to the
    Content-Type header.
    """
    headers = url_handle.headers

    content_disposition = headers.get('Content-Disposition')
    if content_disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', content_disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
2280
2281
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 base64 data: URI for the given bytes and MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
2284
2285
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No viewer limit configured, or the content is open to everyone.
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2294
2295
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Byte-order marks and their encodings, longest prefixes first.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    decoded = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    # HTML (or XML) starts with an optional-whitespace-prefixed '<'.
    return re.match(r'^\s*<', decoded)
2314
2315
def determine_protocol(info_dict):
    """Work out the download protocol for an info dict.

    An explicit 'protocol' entry wins; otherwise the URL scheme prefix or
    the file extension decides, falling back to the parsed URL scheme.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for streaming_scheme in ('rtmp', 'mms', 'rtsp'):
        # Covers variants too (e.g. rtmpe:// still maps to 'rtmp').
        if url.startswith(streaming_scheme):
            return streaming_scheme

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2336
2337
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Widest cell per column determines that column's width.
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*table)]
    # All columns but the last are left-justified and padded by one space.
    fmt = ' '.join('%-' + compat_str(width + 1) + 's' for width in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in table)
2344
2345
def _match_one(filter_part, dct):
    """Evaluate one filter expression against dict *dct*.

    Supports binary comparisons ('duration > 60', 'uploader = foo',
    optionally None-inclusive with '?') and unary presence tests
    ('is_live', '!is_live'). Returns a truthy value when the filter
    passes; raises ValueError for an unparsable filter part.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if (m.group('strval') is not None or
                # If the original field is a string and matching comparisonvalue is
                # a number we should respect the origin of the original field
                # and process comparison value as a string (see
                # https://github.com/rg3/youtube-dl/issues/11082).
                actual_value is not None and m.group('intval') is not None and
                isinstance(actual_value, compat_str)):
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval') or m.group('intval')
        else:
            # Numeric comparison: try a plain int, then a filesize suffix
            # ('500k'), then the same with an explicit 'B' appended ('500KiB').
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            # Missing field passes only when the '?' suffix was given.
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2409
2410
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&'-separated parts must all pass.
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2416
2417
def match_filter_func(filter_str):
    """Build a match_filter callable from a filter string.

    The returned function yields None when the video passes the filter,
    or a human-readable skip reason otherwise.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2426
2427
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float).

    Accepts plain offsets ('12', '12.5s') and clock times
    ('HH:MM:SS', 'HH:MM:SS.mmm', 'HH:MM:SS:fff'). Returns None on
    empty or unrecognized input.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # A ':' before the fraction (frames-style) is treated as '.'.
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
2439
2440
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode 'HH:MM:SS,mmm'."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d truncates the float parts, matching SRT's integer fields.
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2443
2444
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup (a str) into SRT text.

    Raises ValueError when the document contains no <p> cues.
    """
    # Helper binding the known TTML namespace variants for xpath lookups.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Flattens a <p> element into plain text, turning <br/> into newlines.
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Re-serialize the node and feed it through the flattening parser.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    # Try each namespace variant, then un-namespaced <p> as a last resort.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        # Cues without a usable begin (or neither end nor dur) are dropped.
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2498
2499
def cli_option(params, command_option, param):
    """Render params[param] as ['<option>', '<value>'] for an external
    command line; empty list when the parameter is absent."""
    value = params.get(param)
    if value:
        value = compat_str(value)
    return [command_option, value] if value is not None else []
2505
2506
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render boolean params[param] as command-line arguments, either as
    two items or joined with *separator* into one."""
    flag = params.get(param)
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
2513
2514
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit the bare option when params[param] equals *expected_value*."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2518
2519
def cli_configuration_args(params, param, default=[]):
    """Return the extra-args list stored in params[param], or *default*.

    NOTE(review): the mutable default list is shared across calls; it is
    only returned here, never mutated, so callers must not modify it
    in place.
    """
    extra_args = params.get(param)
    if extra_args is None:
        return default
    assert isinstance(extra_args, list)
    return extra_args
2526
2527
class ISO639Utils(object):
    """Conversions between ISO 639-1 (two-letter) and ISO 639-2/T
    (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are consulted, so region suffixes
        # such as 'en-US' are tolerated; unknown codes yield None.
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the map; returns None for unknown codes.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
2728
2729
class ISO3166Utils(object):
    """Lookup of full country names from ISO 3166-1 alpha-2 codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive on the input; unknown codes yield None.
        return cls._country_map.get(code.upper())
2988
2989
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honors a per-request proxy override passed via the
    internal 'Ytdl-request-proxy' header; SOCKS proxies are deferred to the
    http/https handlers through the 'Ytdl-socks-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            # The keyword defaults bind the current loop values (late-binding
            # closure pitfall avoided deliberately); '__noproxy__' marks
            # "no scheme-level proxy configured".
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy (set by YoutubeDL) overrides the handler-level
        # one; the internal header is removed before the request goes out.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
3013
3014
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The reversed bytes are read as one big-endian hex number, i.e. the
    # original data is interpreted as a little-endian integer.
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
3030
3031
def encode_base_n(num, n, table=None):
    """Render non-negative integer *num* in base *n* using *table* as the
    digit alphabet (defaults to 0-9a-zA-Z truncated to *n* symbols)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
3048
3049
def decode_packed_codes(code):
    """Decode JavaScript obfuscated with Dean Edwards' p.a.c.k.e.r.

    Rebuilds the symbol table from the packed payload and substitutes every
    word token of the obfuscated source with its original symbol.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for index in range(count - 1, -1, -1):
        base_n_index = encode_base_n(index, base)
        # An empty symbol means the token stands for itself.
        symbol_table[base_n_index] = symbols[index] or base_n_index

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
3066
3067
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=value,KEY="quoted"') into a dict,
    stripping surrounding quotes from quoted values."""
    attribute_re = r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)'
    return dict(
        (key, val[1:-1] if val.startswith('"') else val)
        for key, val in re.findall(attribute_re, attrib))
3075
3076
def urshift(val, n):
    """Unsigned (logical) 32-bit right shift of *val* by *n* bits."""
    if val >= 0:
        return val >> n
    # Map the negative value into its unsigned 32-bit representation first.
    return (val + 0x100000000) >> n
3079
3080
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode PNG bytes into (width, height, pixels).

    *pixels* is a list of rows; each row is a flat list of channel bytes
    (the code assumes 3 bytes per pixel — see stride below). Handles PNG
    scanline filter types 0-4. Raises IOError on malformed input.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Big-endian unsigned ints of width 1, 2 or 4 bytes.
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Split the stream into chunks: 4-byte length, 4-byte type, data, CRC.
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is always the first chunk; width/height are its first 8 bytes.
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    # Image data may be split across several IDAT chunks; concatenate them.
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # 3 bytes per pixel; every scanline is prefixed by one filter-type byte.
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # Look up an already-reconstructed channel byte by flat index.
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Filter neighbours: 'left' is the same channel of the previous
            # pixel (3 bytes back), 'up' the byte directly above.
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Pick the neighbour closest to the Paeth predictor.
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
3186
3187
def write_xattr(path, key, value):
    """Set extended attribute *key* (str) to *value* (bytes) on file *path*.

    Tries, in order: the pyxattr/xattr Python modules; NTFS Alternate Data
    Streams on Windows; the setfattr/xattr command-line tools elsewhere.
    Raises XAttrMetadataError when setting fails and XAttrUnavailableError
    when no usable implementation exists.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        # Both the 'pyxattr' and 'xattr' PyPI packages import as 'xattr';
        # they are told apart by their API surface.
        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The CLI tools take the value as text, not bytes.
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")