]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[nhl] Add support for wch2016.com (Closes #10833)
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import errno
15 import functools
16 import gzip
17 import io
18 import itertools
19 import json
20 import locale
21 import math
22 import operator
23 import os
24 import pipes
25 import platform
26 import re
27 import socket
28 import ssl
29 import subprocess
30 import sys
31 import tempfile
32 import traceback
33 import xml.etree.ElementTree
34 import zlib
35
36 from .compat import (
37 compat_HTMLParser,
38 compat_basestring,
39 compat_chr,
40 compat_etree_fromstring,
41 compat_html_entities,
42 compat_html_entities_html5,
43 compat_http_client,
44 compat_kwargs,
45 compat_os_name,
46 compat_parse_qs,
47 compat_shlex_quote,
48 compat_socket_create_connection,
49 compat_str,
50 compat_struct_pack,
51 compat_struct_unpack,
52 compat_urllib_error,
53 compat_urllib_parse,
54 compat_urllib_parse_urlencode,
55 compat_urllib_parse_urlparse,
56 compat_urllib_parse_unquote_plus,
57 compat_urllib_request,
58 compat_urlparse,
59 compat_xpath,
60 )
61
62 from .socks import (
63 ProxyType,
64 sockssocket,
65 )
66
67
def register_socks_protocols():
    """Teach urlparse that SOCKS URL schemes carry a network location.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs with protocols not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    for proto in ('socks', 'socks4', 'socks4a', 'socks5'):
        if proto not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(proto)
75
76
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers added to every request (a desktop Firefox UA avoids
# being served mobile or bot-targeted pages)
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel distinguishing "no default supplied" from an explicit default of None
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names keyed by language code, for parsing free-form dates
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# Media file extensions recognized when guessing a format from a URL/path
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

# strptime() patterns tried in order — presumably consumed by the date parsing
# helpers defined later in this file (e.g. unified_strdate); TODO confirm
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%b %d %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
)

# Variant preferring day-first (European) order for ambiguous numeric dates
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# Variant preferring month-first (US) order for ambiguous numeric dates
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
167
168
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Make sure the reported codec actually exists before trusting it
        'TEST'.encode(encoding)
    except Exception:
        encoding = 'UTF-8'
    return encoding
182
183
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible.

    The object is serialized into a NamedTemporaryFile in the target
    directory and then os.rename()d over fn, so readers never observe a
    half-written file. On failure the temporary file is removed and the
    exception re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object.
        # NOTE: previously these lambdas ignored their parameter and closed
        # over `fn` directly; use the parameter so they behave as advertised.
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        # Place the temp file next to fn so the final rename stays on the
        # same filesystem (and therefore atomic)
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temp file, then propagate the error
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
236
237
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # Restrict the attribute name so it cannot break out of the
        # generated XPath expression below
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    # Python 2.6 ElementTree does not support attribute predicates in
    # find(); emulate them by scanning every match manually
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
256
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into '{uri}tag' form using ns_map."""
    expanded = []
    for step in path.split('/'):
        parts = step.split(':')
        if len(parts) == 1:
            # No namespace prefix on this step
            expanded.append(parts[0])
        else:
            prefix, tag = parts
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
267
268
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath (a string or an iterable of
    alternatives tried in order).

    Returns the element, `default` if given and nothing matched, raises
    ExtractorError if fatal and nothing matched, otherwise None.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # Try each alternative in turn; previously `n` was left unbound
        # (UnboundLocalError) when the iterable was empty.
        n = None
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
290
291
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text content."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    # Element missing (None) or the default was returned: pass it through
    if n is None or n == default:
        return n
    if n.text is not None:
        return n.text
    # Element exists but carries no text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
305
306
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute `key` of the element matching xpath[@key], honoring
    the same default/fatal semantics as xpath_element()."""
    el = find_xpath_attr(node, xpath, key)
    if el is not None:
        return el.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
318
319
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper over get_element_by_attribute for the common id= case
    return get_element_by_attribute('id', id, html)
323
324
def get_element_by_class(class_name, html):
    """Return the content of the first tag whose class attribute contains
    class_name as a whole word (the attribute may list several classes)."""
    return get_element_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
329
330
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # `value` may itself be a regex when escape_value is False
    if escape_value:
        value = re.escape(value)

    match = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html)

    if not match:
        return None
    content = match.group('content')

    # Strip one level of surrounding quotes, if present
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
354
355
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.attrs = {}

    def handle_starttag(self, tag, attrs):
        # Callers feed a single element; a later start tag would simply
        # overwrite the previously recorded attributes
        self.attrs = dict(attrs)
364
365
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    attr_parser.feed(html_element)
    attr_parser.close()
    return attr_parser.attrs
386
387
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    # Convenience for sanitizing descriptions etc.
    if html is None:
        return html

    # Literal newlines are insignificant in HTML; real line breaks come
    # from <br> tags and paragraph boundaries instead
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Strip html tags
    text = re.sub('<.*?>', '', text)
    # Replace html entities and trim surrounding whitespace
    return unescapeHTML(text).strip()
403
404
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                # Switch stdout to binary mode so media bytes are not
                # mangled by CRLF translation
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors cannot be fixed by renaming; re-raise as-is
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
435
436
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable input yields None rather than raising
        return None
    return email.utils.mktime_tz(parsed)
444
445
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # NOTE: the order of these checks matters — accented characters must
        # be transliterated before the generic non-ASCII fallback below.
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps: turn '12:34:56' into '12_34_56' before the
    # per-character pass would turn ':' into ' -'
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        # Collapse runs of underscores introduced by the substitutions above
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        # Avoid creating hidden files on POSIX
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
484
485
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # No-op elsewhere: POSIX paths allow almost any character
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # splitdrive() only learned UNC paths in 2.7; fall back to splitunc
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters forbidden in Windows path components, and trailing
    # spaces/dots (which Windows silently strips), with '#'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
502
503
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    """Prefix scheme-relative URLs ('//host/...') with 'http:'; leave others untouched."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
508
509
def sanitized_Request(url, *args, **kwargs):
    # Request constructor wrapper that first runs the URL through sanitize_url()
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
512
513
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # A list (not a set) is used for the seen-check so unhashable elements
    # (e.g. dicts) are supported; lookups are O(n) but inputs are small.
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
521
522
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character reference: decimal '&#65;' or hexadecimal
    # '&#x41;'/'&#X41;' — HTML allows both cases of the hex marker,
    # but only lowercase 'x' was accepted before.
    mobj = re.match(r'#([xX][0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(('x', 'X')):
            base = 16
            # int() accepts the '0x'/'0X' prefix with base 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
552
553
def unescapeHTML(s):
    """Replace HTML entities like '&amp;' or '&#65;' in s with their characters."""
    if s is None:
        return None
    assert type(s) == compat_str

    # Each '&...;' span is handed to _htmlentity_transform for decoding
    return re.sub(
        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
561
562
def get_subprocess_encoding():
    """Return the encoding to use for subprocess arguments and filenames."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding
573
574
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    # 'ignore' drops unencodable characters instead of raising
    return s.encode(get_subprocess_encoding(), 'ignore')
597
598
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename(): decode a byte filename on Python 2."""
    # Python 3 filenames are already text
    if sys.version_info >= (3, 0):
        return b
    # Non-bytes values are passed through unchanged
    if not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
608
609
def encodeArgument(s):
    # Encode a command-line argument for subprocess use
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
617
618
def decodeArgument(b):
    # Inverse of encodeArgument() (for_subprocess=True)
    return decodeFilename(b, True)
621
622
def decodeOption(optval):
    # Normalize a command-line option value to text (compat_str); None passes through
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
631
632
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    Boundaries use >= so that exactly one hour renders as '1:00:00'
    (previously '60:00') and exactly one minute as '1:00' (previously '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
640
641
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honoring params['nocheckcertificate'],
    choosing the best SSL context the running Python version supports."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # Old Pythons: no usable SSLContext support in HTTPSHandler
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
665
666
def bug_reports_message():
    """Return the standard bug-report boilerplate appended to unexpected errors."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return (
        '; please report this issue on https://yt-dl.org/bug .'
        ' Make sure you are using the latest version; %s.'
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        % update_cmd)
676
677
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-related failures currently being handled are always
        # "expected" (i.e. not a youtube-dl bug)
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get the bug-report boilerplate appended
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        # Render the stored traceback, or None if none was supplied
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
705
706
class UnsupportedError(ExtractorError):
    """Raised when no extractor supports the given URL (always 'expected')."""
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url
712
713
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # Subclass of ExtractorError so callers can handle it uniformly
    pass
717
718
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Original (exc_type, exc_value, traceback) triple, if any
        self.exc_info = exc_info
731
732
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
740
741
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Pass the message to Exception so str(e) is meaningful; previously
        # only self.msg was set and str(e) was always empty.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
751
752
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    # Used as a control-flow signal to stop processing further videos
    pass
756
757
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
765
766
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Include the byte counts in the message so str(e) is useful;
        # previously Exception.__init__ was never called and str(e) was ''.
        super(ContentTooShortError, self).__init__(
            'Content too short (got %s bytes, expected %s)' % (downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
779
780
class XAttrMetadataError(Exception):
    """Raised when reading/writing filesystem extended attributes fails.

    self.reason classifies the failure ('NO_SPACE', 'VALUE_TOO_LONG' or
    'NOT_SUPPORTED') from the errno code and/or the OS error message.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        # NOTE: the OS message is 'Disk quota exceeded'; the previous
        # misspelling 'excedded' made that substring check dead code.
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
795
796
class XAttrUnavailableError(Exception):
    """Raised when no usable xattr implementation is available on this system."""
    pass
799
800
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an HTTP(S) connection, optionally bound to the configured
    source address; used by the YoutubeDL urllib handlers."""
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address support: monkey-patch connect() to create
            # the socket (and wrap it in TLS for HTTPS) bound to sa manually
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
826
827
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, it is removed together with any
    Accept-Encoding header (case-insensitive key match); otherwise the
    original dict is returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict(
        (key, value) for key, value in headers.items()
        if key.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
836
837
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Route through a SOCKS proxy when the internal Ytdl-socks-proxy
        # header is present; the header is stripped before sending
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        # Handle both raw deflate streams and zlib-wrapped ones
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # addinfourl only accepts a `code` argument on newer Pythons
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        # Strip the internal Youtubedl-no-compression marker (and, with it,
        # any Accept-Encoding header)
        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry decompression with up to 1023 trailing bytes stripped
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS requests/responses are processed identically
    https_request = http_request
    https_response = http_response
967
968
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class from base_class that tunnels through the
    SOCKS proxy described by the `socks_proxy` URL (socks4/socks4a/socks5)."""
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Previously an unrecognized scheme fell through and crashed later
        # with UnboundLocalError; fail early with a meaningful error instead
        raise ValueError('Unsupported SOCKS proxy scheme: %s' % scheme)

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, additionally wrap the proxied socket in TLS
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1010
1011
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS counterpart of YoutubeDLHandler; supports a custom connection
    class and SOCKS proxying via the internal Ytdl-socks-proxy header."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        # Forward the SSL context / hostname checking set up by
        # make_HTTPS_handler, where available on this Python version
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # Route through a SOCKS proxy when requested; strip internal header
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)
1035
1036
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor that routes HTTPS traffic through the same cookie
    handling as plain HTTP."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Aliases so the handler is consulted for https:// requests too.
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1059
1060
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (offset, remainder): *offset* is a datetime.timedelta for the
    UTC offset ('Z', a missing designator, or a signless match give zero),
    and *remainder* is *date_str* with the designator removed.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if m is None:
        return datetime.timedelta(), date_str
    remainder = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # 'Z' (UTC) carries no numeric offset.
        return datetime.timedelta(), remainder
    direction = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, remainder
1077
1078
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date, or None on failure. """
    if date_str is None:
        return None

    # Drop fractional seconds; '%S' in strptime cannot parse them.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        naive = datetime.datetime.strptime(
            date_str, '%Y-%m-%d{0}%H:%M:%S'.format(delimiter))
    except ValueError:
        return None
    return calendar.timegm((naive - timezone).timetuple())
1096
1097
def date_formats(day_first=True):
    """Return the strptime patterns, preferring day-first or month-first order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1100
1101
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None (implicitly) when no known format matches.  *day_first*
    selects whether ambiguous dates are read as DD.MM or MM/DD.
    """

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    # The offset itself is discarded; only the date portion is kept.
    _, date_str = extract_timezone(date_str)

    # NOTE: no break — the last format that parses successfully wins.
    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Fall back to RFC 2822 style dates (email Date: headers).
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
1128
1129
def unified_timestamp(date_str, day_first=True):
    """Parse a free-form date string into a UNIX timestamp, or None.

    Honours an AM/PM marker and a trailing timezone designator; *day_first*
    selects DD.MM vs MM/DD interpretation for ambiguous dates.
    """
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    # Capture the PM shift and the UTC offset before stripping them.
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    # Fall back to RFC 2822 style dates (email Date: headers).
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
1151
1152
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from *url*, falling back to *default_ext*."""
    if url is None:
        return default_ext
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1164
1165
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle path <base>.<lang>.<format> next to *filename*."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
1168
1169
def date_from_str(date_str):
    """
    Return a datetime.date from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Raises ValueError for strings matching neither form.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string literal: '\d' in a plain string is an invalid escape
    # sequence (DeprecationWarning, later SyntaxWarning) on modern Python.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta has no month/year units; approximate them in days.
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1197
1198
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    m = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(m.groups()) if m is not None else date_str
1207
1208
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the widest representable dates.
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1238
1239
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Python 2 may hand back a byte string; decode with the locale encoding.
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1248
1249
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map C-level fds to GetStdHandle ids (-11/-12 are the Win32
    # STD_OUTPUT_HANDLE / STD_ERROR_HANDLE constants).
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # WriteConsoleW only works on real console handles; anything
        # redirected (file, pipe, remote) must use the normal write path.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first astral (non-BMP) character, or len(s).
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # BMP characters go out in chunks of up to 1024; a non-BMP character
        # is written alone as its two UTF-16 code units (count passed as 2).
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1323
1324
def write_string(s, out=None, encoding=None):
    """Write text *s* to stream *out* (default: stderr), coping with Windows
    consoles and byte-oriented streams across Python 2/3."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        # Use WriteConsoleW so Unicode renders correctly on cmd.exe.
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text stream over a binary buffer: encode ourselves with 'ignore'
        # so unencodable characters are dropped instead of crashing.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
1345
1346
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Python 3 indexing of bytes already yields ints; Python 2 yields str.
    return list(bs) if isinstance(bs[0], int) else [ord(c) for c in bs]
1354
1355
def intlist_to_bytes(xs):
    """Inverse of bytes_to_intlist: pack integer byte values into a byte string."""
    return compat_struct_pack('%dB' % len(xs), *xs) if xs else b''
1360
1361
# Cross-platform file locking: defines _lock_file/_unlock_file using
# LockFileEx/UnlockFileEx on Windows and flock elsewhere.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Layout matches the Win32 OVERLAPPED struct that
        # LockFileEx/UnlockFileEx expect.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range covering effectively the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1435
1436
class locked_file(object):
    """File wrapper that holds an OS-level lock for the life of a `with` block."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Readers share the lock; writers ('a'/'w') need it exclusively.
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1466
1467
def get_filesystem_encoding():
    """sys.getfilesystemencoding(), defaulting to 'utf-8' when it is None
    (possible on Python 2)."""
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
1471
1472
def shell_quote(args):
    """Join *args* into a single shell-safe command-line string."""
    encoding = get_filesystem_encoding()

    def as_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(encoding) if isinstance(a, bytes) else a

    return ' '.join(pipes.quote(as_text(a)) for a in args)
1482
1483
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any data already smuggled into the URL.
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, sdata)
1492
1493
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url: return (bare_url, smuggled_data_or_default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    payload = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1501
1502
def format_bytes(bytes):
    """Human-readable size: 1536 -> '1.50KiB'; None -> 'N/A'."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1515
1516
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number> <unit>' from *s* using the multipliers in
    *unit_table*; returns an int byte/count value or None."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    # European decimal commas are accepted alongside dots.
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
1526
1527
def parse_filesize(s):
    """Parse a human file size ('5.4 MiB', '500 kB', '3 gigabytes') into a
    byte count, or None."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too.  For each SI prefix the convention here is:
    #   'KiB'            -> binary (1024**n)
    #   'KB', 'Kb', 'kb' -> decimal (1000**n)
    #   'kB'             -> binary (historical quirk, kept for compatibility)
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    _DECIMAL_NAMES = ('kilo', 'mega', 'giga', 'tera', 'peta', 'exa', 'zetta', 'yotta')
    _BINARY_NAMES = ('kibi', 'mebi', 'gibi', 'tebi', 'pebi', 'exbi', 'zebi', 'yobi')
    for power, (prefix, dec_name, bin_name) in enumerate(
            zip('kmgtpezy', _DECIMAL_NAMES, _BINARY_NAMES), 1):
        decimal, binary = 1000 ** power, 1024 ** power
        _UNIT_TABLE[prefix.upper() + 'iB'] = binary
        _UNIT_TABLE[prefix.upper() + 'B'] = decimal
        _UNIT_TABLE[prefix + 'B'] = binary
        _UNIT_TABLE[prefix.upper() + 'b'] = decimal
        _UNIT_TABLE[prefix + 'b'] = decimal
        _UNIT_TABLE[dec_name + 'bytes'] = decimal
        _UNIT_TABLE[bin_name + 'bytes'] = binary

    return lookup_unit_table(_UNIT_TABLE, s)
1597
1598
def parse_count(s):
    """Parse a view/like count such as '1.2M' or '12,345' into an int."""
    if s is None:
        return None

    s = s.strip()

    # Plain numbers (possibly with separators) need no unit table.
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }
    return lookup_unit_table(multipliers, s)
1618
1619
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in month_names:
        return None
    return month_names.index(name) + 1
1629
1630
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [s[:3] for s in ENGLISH_MONTH_NAMES]
    if abbrev not in abbreviations:
        return None
    return abbreviations.index(abbrev) + 1
1639
1640
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Already-escaped entities (&amp;, &lt;, &#xA0;, ...) are left untouched.
    bare_amp = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_amp, '&amp;', xml_str)
1647
1648
def setproctitle(title):
    """Set the process name shown by ps/top via prctl(PR_SET_NAME) on Linux.

    Best-effort: silently does nothing on Jython, when libc cannot be
    loaded, or when the libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Reserve one extra byte: prctl(PR_SET_NAME) reads a NUL-terminated C
    # string, and a buffer of exactly len(title_bytes) would leave the
    # string unterminated.
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1668
1669
def remove_start(s, start):
    """Strip *start* from the beginning of *s* (None passes through)."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1672
1673
def remove_end(s, end):
    """Strip *end* from the end of *s* (None passes through).

    Fixed: with an empty *end*, ``s[:-len(end)]`` evaluated to ``s[:0]``
    and wrongly returned '' instead of *s*.
    """
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
1676
1677
def remove_quotes(s):
    """Strip one layer of matching single or double quotes from *s*."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1685
1686
def url_basename(url):
    """Return the last path component of *url* ('' for the root path)."""
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').rpartition('/')[2]
1690
1691
class HEADRequest(compat_urllib_request.Request):
    """A Request whose HTTP method is always HEAD."""
    def get_method(self):
        return 'HEAD'
1695
1696
class PUTRequest(compat_urllib_request.Request):
    """A Request whose HTTP method is always PUT."""
    def get_method(self):
        return 'PUT'
1700
1701
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* to an int scaled by invscale/scale; *default* on failure.

    When *get_attr* is given, *v* is first replaced by
    getattr(v, get_attr, None).  Empty strings count as missing.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # TypeError: non-numeric objects (lists, dicts, ...) previously
        # escaped as an exception instead of yielding *default*.
        return default
1714
1715
def str_or_none(v, default=None):
    """compat_str(v), or *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
1718
1719
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators (',' or '.') and a leading '+'.
    return int(re.sub(r'[,\.\+]', '', int_str))
1726
1727
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to a float scaled by invscale/scale; *default* on failure."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: mirror int_or_none — non-numeric objects yield
        # *default* instead of raising.
        return default
1735
1736
def strip_or_none(v):
    """v.strip(), passing None through."""
    if v is None:
        return None
    return v.strip()
1739
1740
def parse_duration(s):
    """Parse a duration ('1:23:45', '2h 3min 4.5s', 'PT1H2M3S', '2.5 hours')
    into seconds (int/float), or None when unrecognized."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    # Clock style: [[[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # Unit style: '1d 2h 3m 4.5s', also bare ISO 8601 durations 'PT1H2M3S'
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # Last resort: decimal hours/minutes like '2.5 hours' / '90 min'
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    # Sum whatever components matched; 'ms' still carries its leading dot,
    # so float(ms) yields the fractional seconds directly.
    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
1787
1788
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension: 'a.mp4' -> 'a.<ext>.mp4'.

    If *expected_real_ext* is given and the current extension differs,
    *ext* is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1795
1796
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace filename's extension with *ext*.

    If *expected_real_ext* is given and the current extension differs,
    *ext* is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}'.format(name, ext)
1802
1803
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: the mutable default is never mutated here and is kept for
    # interface compatibility.
    try:
        subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
1812
1813
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # stderr is merged into stdout: some tools print the version there.
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1827
1828
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's *output*; return
    *unrecognized* when no version can be found."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1838
1839
class PagedList(object):
    """Abstract lazily-paged list; subclasses must implement getslice()."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1844
1845
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand via pagefunc(pagenum), with an
    optional per-page result cache."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) (end=None means 'to the end')."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # Item-id range covered by this page: [firstid, nextfirstid).
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offsets of the requested slice within this page.
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1896
1897
class InAdvancePagedList(PagedList):
    """PagedList for sources where the total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) (end=None means 'to the end')."""
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Leading offset inside the first page; trailing budget of items.
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page may need a leading offset.
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1925
1926
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escapes (astral-plane chars) inside *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
1933
1934
def lowercase_escape(s):
    """Decode literal \\uXXXX escapes (BMP chars) inside *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
1941
1942
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        # Python 2 quote() cannot handle unicode with non-ASCII characters.
        s = s.encode('utf-8')
    # Safe set: RFC 3986 reserved + unreserved punctuation plus '%'.
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1948
1949
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parsed = compat_urllib_parse_urlparse(url)
    # IDNA-encode the host; percent-escape every other component.
    return parsed._replace(
        netloc=parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parsed.path),
        params=escape_rfc3986(parsed.params),
        query=escape_rfc3986(parsed.query),
        fragment=escape_rfc3986(parsed.fragment)
    ).geturl()
1960
1961
def read_batch_urls(batch_fd):
    """Read URLs from an open batch file, skipping blank lines and comment
    lines (starting with '#', ';' or ']'); closes the file when done."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # The UTF-8 BOM bytes as text — presumably how a BOM appears after a
        # per-byte decode on Python 2; verify before changing.
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            # Returning False (falsy) drops the line in the filter below.
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1976
1977
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1980
1981
def update_url_query(url, query):
    """Return *url* with the parameters in *query* merged into its query string."""
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    updated = parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))
    return compat_urlparse.urlunparse(updated)
1990
1991
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Clone *req* with updated url/data/headers/query, preserving its HTTP
    method (HEAD/PUT via the dedicated Request subclasses) and timeout."""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    # Pick a Request class whose get_method() matches the original request.
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2010
2011
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up the first usable key of *key_or_keys* in dict *d*.

    None values are always skipped; other falsy values are skipped too
    unless *skip_false_values* is False.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d or d[key] is None:
            continue
        if skip_false_values and not d[key]:
            continue
        return d[key]
    return default
2020
2021
def try_get(src, getter, expected_type=None):
    """Apply *getter* to *src*, swallowing common lookup errors; optionally
    gate the result on *expected_type*.  Returns None on failure/mismatch."""
    try:
        v = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is None or isinstance(v, expected_type):
        return v
    return None
2030
2031
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    # Decode *string* to text if it is a byte string; pass text through.
    # NOTE: the default encoding is evaluated once, at import time.
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2034
2035
# MPAA film rating -> approximate minimum age, used for the age_limit field.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# US TV Parental Guidelines rating -> approximate minimum age.
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2053
2054
def parse_age_limit(s):
    """Normalize an age limit (int, '18+', MPAA or TV rating) to an int age,
    or None when unrecognized."""
    # Exact type check kept deliberately: bools and int subclasses must not
    # take this branch (matches the original behavior).
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m is not None:
        return int(m.group('age'))
    return US_RATINGS[s] if s in US_RATINGS else TV_PARENTAL_GUIDELINES.get(s)
2066
2067
def strip_jsonp(code):
    """Strip a JSONP wrapper callback(...) plus any trailing // comments."""
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
2071
2072
def js_to_json(code):
    """Convert a JavaScript object/value literal into valid JSON text.

    Handles single-quoted strings, unquoted identifier keys, hex/octal
    integers, /* */ comments and trailing commas.
    """
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            # Comments and trailing commas are simply dropped.
            return ""

        if v[0] in ("'", '"'):
            # Re-escape the string body: JSON requires \" and knows
            # neither \' nor \x, and forbids escaped literal newlines.
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])

        INTEGER_TABLE = (
            (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
            (r'^(0+[0-7]+)\s*:?$', 8),
        )

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # As a key it must be quoted; as a value it stays bare.
                return '"%d":' % i if v.endswith(':') else '%d' % i

        # Bare identifiers (object keys) get quoted.
        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
2110
2111
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Unknown ids rank below every known one.
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2120
2121
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2123
2124
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
2133
2134
def version_tuple(v):
    """Split a version string on '.' or '-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
2137
2138
def is_outdated_version(version, limit, assume_new=True):
    """Return True when *version* compares older than *limit*.

    A missing or unparsable version is treated as new unless
    *assume_new* is False.
    """
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
    return outdated
2146
2147
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U (zip or frozen build) """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
2153
2154
def args_to_str(args):
    """Render a subprocess argv list as one shell-quoted display string."""
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2158
2159
def error_to_compat_str(err):
    """Stringify an exception, decoding the Python 2 byte string with
    the preferred locale encoding rather than ASCII."""
    if sys.version_info[0] >= 3:
        return str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return str(err).decode(preferredencoding())
2167
2168
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    Unknown subtypes are returned as-is; None maps to None.
    """
    if mt is None:
        return None

    # Full-type mappings that cannot be derived from the subtype alone.
    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    if mt in FULL_MAP:
        return FULL_MAP[mt]

    # Keep only the subtype, dropping any ';...' parameters.
    subtype = mt.rpartition('/')[2].split(';')[0].strip().lower()

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
    }
    return SUBTYPE_MAP.get(subtype, subtype)
2205
2206
def parse_codecs(codecs_str):
    """Parse an RFC 6381 codecs string (e.g. 'avc1.64001f, mp4a.40.2')
    into {'vcodec': ..., 'acodec': ...} ('none' marks an absent track).

    Returns {} when nothing could be parsed.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = list(filter(None, map(
        lambda c: c.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        # Nothing was recognized (vcodec/acodec are both None here), so
        # fall back to positional assumptions instead of returning Nones.
        if len(splited_codecs) == 2:
            # Two unknown codecs: assume "video, audio" ordering.
            return {
                'vcodec': splited_codecs[0],
                'acodec': splited_codecs[1],
            }
        elif len(splited_codecs) == 1:
            # A single unknown codec: assume audio-only.
            return {
                'vcodec': 'none',
                'acodec': splited_codecs[0],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
2241
2242
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a response, preferring the
    Content-Disposition filename over the Content-Type."""
    headers = url_handle.headers

    cd = headers.get('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
2255
2256
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 base64 ``data:`` URI for *data* (bytes)."""
    encoded = base64.b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, encoded)
2259
2260
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # No viewer limit set, or content available for everyone.
        return False
    return age_limit < content_limit
2269
2270
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Decode honoring a BOM if one is present, falling back to UTF-8.
    BOMS = (
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    )
    decoded = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    # HTML starts with an (optionally whitespace-preceded) tag.
    return re.match(r'^\s*<', decoded)
2289
2290
def determine_protocol(info_dict):
    """Infer the download protocol for *info_dict*: explicit 'protocol'
    key first, then URL prefix, then extension, then the URL scheme."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2311
2312
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values, as aligned columns """
    rows = [header_row] + data
    # The widest cell in each column decides that column's padding.
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Left-align every column but the last to its width + 1.
    fmt = ' '.join('%-' + compat_str(width + 1) + 's' for width in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
2319
2320
def _match_one(filter_part, dct):
    """Evaluate one filter expression (e.g. 'duration > 600', 'id = abc',
    '!is_live') against *dct*; helper for match_str."""
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # key OP value, where value is numeric (optionally with a filesize
    # suffix) or a plain alphanumeric string. A '?' after the operator
    # makes the filter pass when the key is missing.
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # Strings only support (in)equality.
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain integer: try a filesize ('500KiB'), then
                # retry with a 'B' appended ('500K' -> '500KB').
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # Missing key: truthy only when the '?' suffix was given.
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    # 'key' (must be present) / '!key' (must be absent).
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2378
2379
def match_str(filter_str, dct):
    """Filter a dictionary with a simple string syntax.

    '&' joins sub-filters, all of which must match. Returns True
    (=passes filter) or False.
    """
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2385
2386
def match_filter_func(filter_str):
    """Build a match-filter callback: returns None when the video passes
    *filter_str*, otherwise a human-readable skip reason."""
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
2395
2396
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression ('12.5s' or 'H:MM:SS[.f]')
    into seconds (float); returns None when empty or unparsable."""
    if not time_expr:
        return

    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        # A ':' before the fraction (frame-style) is read as '.'.
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
2408
2409
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d truncates the float components.
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2412
2413
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup (a str) into SRT format.

    Raises ValueError when the document contains no <p> cues.
    """
    # Support the several TTML namespaces seen in the wild.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Accumulates the text content of one <p>, turning <br/> into
        # newlines and discarding all other markup.
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Re-serialize the node and feed it through the text extractor.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    # Try each namespace in turn, then un-namespaced <p> as last resort.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        # Cues without a start time are unusable; a missing end time
        # falls back to begin + dur, else the cue is skipped too.
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2467
2468
def cli_option(params, command_option, param):
    """Return [command_option, value] for an external downloader CLI,
    or [] when the option is unset (None) in *params*."""
    value = params.get(param)
    if value:
        # Stringify non-string values (e.g. ints) for the command line.
        value = compat_str(value)
    if value is None:
        return []
    return [command_option, value]
2474
2475
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as CLI args: two tokens by default, or a
    single 'option<separator>value' token when *separator* is given."""
    param = params.get(param)
    assert isinstance(param, bool)
    value = true_value if param else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2482
2483
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a bare flag when params[param] equals *expected_value*."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2487
2488
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra CLI args stored under *param*, or
    *default* when unset.

    NOTE(review): the mutable [] default is shared across calls; callers
    must not mutate the returned list when relying on the default.
    """
    configuration_args = params.get(param)
    if configuration_args is None:
        return default
    assert isinstance(configuration_args, list)
    return configuration_args
2495
2496
class ISO639Utils(object):
    """Conversion helpers between ISO 639-1 (two-letter) and
    ISO 639-2/T (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Maps ISO 639-1 -> ISO 639-2/T.
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are significant, which tolerates
        # region-qualified inputs such as 'en-US'.
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the map; returns None when unknown.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
2697
2698
class ISO3166Utils(object):
    """Lookup of full country names from ISO 3166-1 alpha-2 codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive on input; returns None when unknown.
        return cls._country_map.get(code.upper())
2957
2958
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler variant that honours a per-request
    'Ytdl-request-proxy' header in addition to the global proxy map."""

    def __init__(self, proxies=None):
        # Set default handlers. The lambda keyword defaults bind the
        # current 'type' and bound method per iteration (late-binding
        # closure workaround).
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy overrides the handler-level one.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do the wrapping of the
            # socket with SOCKS
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2982
2983
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The payload is read little-endian, hence the byte reversal.
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
2999
3000
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer *num* in base *n*.

    table: optional digit alphabet (defaults to 0-9a-zA-Z truncated
    to *n* symbols).
    Raises ValueError when the base exceeds the table length or when
    *num* is negative.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    # A negative num would loop forever below (floor division of a
    # negative never reaches 0), so reject it explicitly.
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
3017
3018
def decode_packed_codes(code):
    """Decode Dean Edwards' p.a.c.k.e.r.-style packed JavaScript."""
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    packed_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Build the replacement table: each word index, rendered in base-n,
    # maps to its symbol (or to itself when the symbol slot is empty).
    symbol_table = {}
    for index in range(count - 1, -1, -1):
        key = encode_base_n(index, base)
        symbol_table[key] = symbols[index] or key

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        packed_code)
3037
3038
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=val,KEY2="quoted,val"') into a dict."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        # Strip the surrounding quotes from quoted values.
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
3046
3047
def urshift(val, n):
    """Unsigned 32-bit right shift (JavaScript's '>>>' operator)."""
    if val < 0:
        val += 0x100000000
    return val >> n
3050
3051
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a PNG byte string into (width, height, pixels).

    pixels is a list of rows, each row a flat list of byte values
    (3 values per pixel; see the stride computation below).
    NOTE(review): assumes 8-bit RGB, non-interlaced data — confirm.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    # Validate the PNG signature and that the first chunk is IHDR.
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Read big-endian unsigned ints of 1, 2 or 4 bytes.
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Walk the chunk stream: 4-byte length, 4-byte type, data, 4-byte CRC.
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is the first chunk (checked above); it carries the dimensions.
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Image data may be split over several IDAT chunks; concatenate them.
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Each scanline is one filter-type byte followed by stride data bytes.
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Filter neighbours: 'left' is the same channel 3 bytes back,
            # 'up' the same position one row above (0 when out of bounds).
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Pick the predictor closest to p (ties favour a, then b).
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
3157
3158
def write_xattr(path, key, value):
    """Set extended attribute *key* = *value* (bytes) on *path*.

    Tries, in order: the pyxattr/xattr Python modules, NTFS alternate
    data streams on Windows, and the setfattr/xattr command-line tools.
    Raises XAttrUnavailableError when no mechanism is available and
    XAttrMetadataError when the write itself fails.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The CLI tools take the value as a text argument.
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")