]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[tv2:article] Fix extraction (Closes #10188)
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import errno
15 import functools
16 import gzip
17 import io
18 import itertools
19 import json
20 import locale
21 import math
22 import operator
23 import os
24 import pipes
25 import platform
26 import re
27 import socket
28 import ssl
29 import subprocess
30 import sys
31 import tempfile
32 import traceback
33 import xml.etree.ElementTree
34 import zlib
35
36 from .compat import (
37 compat_HTMLParser,
38 compat_basestring,
39 compat_chr,
40 compat_etree_fromstring,
41 compat_html_entities,
42 compat_html_entities_html5,
43 compat_http_client,
44 compat_kwargs,
45 compat_parse_qs,
46 compat_shlex_quote,
47 compat_socket_create_connection,
48 compat_str,
49 compat_struct_pack,
50 compat_urllib_error,
51 compat_urllib_parse,
52 compat_urllib_parse_urlencode,
53 compat_urllib_parse_urlparse,
54 compat_urllib_parse_unquote_plus,
55 compat_urllib_request,
56 compat_urlparse,
57 compat_xpath,
58 )
59
60 from .socks import (
61 ProxyType,
62 sockssocket,
63 )
64
65
def register_socks_protocols():
    """Make urlparse treat SOCKS schemes as carrying a netloc component.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose scheme is missing from
    urlparse.uses_netloc are not split correctly, so every SOCKS scheme
    is appended to that registry (once) here.
    """
    registered = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme in registered:
            continue
        registered.append(scheme)
73
74
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default headers attached to every outgoing HTTP request
# (see YoutubeDLHandler.http_request below).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}

# Sentinel distinguishing "no default supplied" from an explicit default of None
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Media file extensions recognized when guessing a format from a URL/path
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# (maps each accented character to an ASCII replacement)
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

# strptime() formats tried, in order, by the unified date parsers below
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%b %d %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
)

# Variants preferring day-first (most of the world) ordering for ambiguous dates
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# Variants preferring month-first (US) ordering for ambiguous dates
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
155
156
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    encoding = 'UTF-8'
    try:
        candidate = locale.getpreferredencoding()
        # Only trust the locale-reported encoding if it actually works
        'TEST'.encode(candidate)
    except Exception:
        pass
    else:
        encoding = candidate
    return encoding
170
171
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    # The temporary file is created in the target directory so that the
    # final os.rename() stays on one filesystem and is therefore atomic
    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temporary file, then re-raise
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
224
225
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """Find the first element matching xpath[@key] (or xpath[@key='val'])."""
        # Only plain attribute names work with this predicate syntax
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            predicate = '[@%s]' % key
        else:
            predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """Fallback for Python 2.6, whose ElementTree lacks attribute predicates."""
        for candidate in node.findall(compat_xpath(xpath)):
            if key not in candidate.attrib:
                continue
            if val is None or candidate.attrib.get(key) == val:
                return candidate
        return None
240
# Note: on Python 2.6 the xml.etree.ElementTree.Element methods do not
# support the namespace parameter, hence the findall()-based fallback
# implementation of find_xpath_attr() above.
243
244
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into ElementTree '{uri}tag' form.

    ns_map maps each prefix used in path to its namespace URI; steps
    without a prefix are passed through unchanged.
    """
    def expand(component):
        parts = component.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(component) for component in path.split('/'))
255
256
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath (or any of a list of xpaths).

    Returns default if supplied and nothing matches; raises ExtractorError
    when fatal is set and no default was given; otherwise returns None.
    """
    def _find(xp):
        return node.find(compat_xpath(xp))

    if isinstance(xpath, (str, compat_str)):
        found = _find(xpath)
    else:
        for xp in xpath:
            found = _find(xp)
            if found is not None:
                break

    if found is not None:
        return found
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element %s' % (xpath if name is None else name))
    return None
278
279
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's .text content."""
    el = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if el is None or el == default:
        return el
    if el.text is not None:
        return el.text
    # Element exists but carries no text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element\'s text %s' % (xpath if name is None else name))
    return None
293
294
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return the value of attribute `key` on the element matching xpath."""
    el = find_xpath_attr(node, xpath, key)
    if el is not None:
        return el.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML attribute %s' % (
            '%s[@%s]' % (xpath, key) if name is None else name))
    return None
306
307
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # NB: the `id` parameter shadows the builtin; kept for API compatibility
    return get_element_by_attribute('id', id, html)
311
312
def get_element_by_class(class_name, html):
    """Return the content of the first tag whose class list contains class_name."""
    # \b-delimited so that e.g. 'foo' does not match class="foobar"
    return get_element_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
317
318
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    value = re.escape(value) if escape_value else value

    # Match an opening tag carrying attribute=value in any position, then
    # capture everything up to the corresponding closing tag.
    # NB: the </\1> backreference does not handle nested same-name tags.
    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html)

    if not m:
        return None
    res = m.group('content')

    # Drop surrounding quotes if the captured content itself is quoted
    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    return unescapeHTML(res)
342
343
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        # Maps attribute name -> value (None for value-less attributes)
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Only the most recent start tag is retained; callers are expected
        # to feed a single element (see extract_attributes)
        self.attrs = dict(attrs)
352
353
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    # feed()/close() populate parser.attrs via handle_starttag()
    parser.feed(html_element)
    parser.close()
    return parser.attrs
374
375
def clean_html(html):
    """Clean an HTML snippet into a readable plain-text string."""
    # Convenience for sanitizing descriptions etc.
    if html is None:
        return None

    # Collapse literal newlines first, then re-introduce them at <br>
    # and paragraph boundaries before stripping all remaining markup.
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    text = re.sub('<.*?>', '', text)
    return unescapeHTML(text).strip()
391
392
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            # '-' means stdout; prefer the underlying binary buffer on Python 3
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission error will not be fixed by renaming; re-raise as-is
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
423
424
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
432
433
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        code = ord(char)
        # Control characters, DEL and '?' are always dropped
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    # Handle timestamps: 12:34:56 becomes 12_34_56 rather than "12 -34 -56"
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(char) for char in s)
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
472
473
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows (a no-op on other platforms)."""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # splitdrive() does not understand UNC paths before Python 2.7
        drive_or_unc, _ = os.path.splitunc(s)
    parts = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        parts.pop(0)
    # Replace characters forbidden in Windows path components (plus a
    # trailing dot/space, also invalid), leaving '.'/'..' steps untouched
    sanitized = []
    for part in parts:
        if part in ['.', '..']:
            sanitized.append(part)
        else:
            sanitized.append(re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', part))
    if drive_or_unc:
        sanitized.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized)
490
491
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    if url.startswith('//'):
        return 'http:%s' % url
    return url
496
497
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request after sanitize_url() has fixed scheme-less URLs."""
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
500
501
def orderedSet(iterable):
    """Return a list of the iterable's elements with duplicates removed,
    preserving first-seen order.

    Membership is tested with `in` on a list, so elements need not be
    hashable (at the cost of O(n^2) behavior).
    """
    seen = []
    for item in iterable:
        if item in seen:
            continue
        seen.append(item)
    return seen
509
510
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character reference: &#NNN; or &#xHHH;
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            # 'x1f' -> '0x1f' so int() accepts it
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
540
541
def unescapeHTML(s):
    """Replace all HTML entities in s with the characters they represent."""
    if s is None:
        return None
    assert type(s) == compat_str

    # Each match hands 'name;' (everything after '&') to the transform
    return re.sub(
        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
549
550
def get_subprocess_encoding():
    """Return the character encoding used to talk to subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    encoding = sys.getfilesystemencoding()
    # The filesystem encoding may be unset in some environments
    if encoding is None:
        encoding = 'utf-8'
    return encoding
561
562
def encodeFilename(s, for_subprocess=False):
    """
    Encode a unicode filename to bytes where the platform needs it.

    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    # Python 2 on POSIX (or subprocess use): encode with the subprocess encoding
    return s.encode(get_subprocess_encoding(), 'ignore')
585
586
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename(): decode a byte filename to unicode.

    On Python 3, or when b is already a unicode string, the value is
    returned unchanged.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
596
597
def encodeArgument(s):
    """Encode a subprocess argument (same rules as encodeFilename(..., True))."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
605
606
def decodeArgument(b):
    """Decode a subprocess argument (inverse of encodeArgument)."""
    return decodeFilename(b, True)
609
610
def decodeOption(optval):
    """Decode a command-line option value to unicode; None passes through."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
619
620
def formatSeconds(secs):
    """Format a duration in seconds as clock-style text.

    Returns 'H:MM:SS' for an hour or more, 'M:SS' for a minute or more,
    and plain seconds otherwise.
    """
    # Use >= so that exactly one hour renders as '1:00:00' (not '60:00')
    # and exactly one minute as '1:00' (not '60')
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
628
629
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler, configuring certificate verification.

    Honours the 'nocheckcertificate' entry of params and falls back through
    the SSL context APIs available on the running Python version.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # No usable SSLContext support at all; rely on handler defaults
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
653
654
def bug_reports_message():
    """Return the standard bug-report footer appended to unexpected errors."""
    if ytdl_is_updateable():
        update_cmd = 'type  youtube-dl -U  to update'
    else:
        update_cmd = 'see  https://yt-dl.org/update  on how to update'
    parts = [
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ]
    return ''.join(parts)
664
665
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-level failures are always treated as "expected" so no
        # bug-report boilerplate is appended to the message
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        # Render the stored traceback as text; None when no traceback was given
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
693
694
class UnsupportedError(ExtractorError):
    """Raised when no extractor supports the given URL."""
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url
700
701
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
705
706
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Original (type, value, traceback) triple, if any
        self.exc_info = exc_info
719
720
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
728
729
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Forward msg to Exception so str(exc) carries the message
        # (previously Exception.__init__ was never called and str(exc)
        # was always empty).
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
739
740
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
744
745
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
753
754
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Give Exception a readable message so str(exc) is informative
        # (previously Exception.__init__ was never called and str(exc)
        # was always empty).
        super(ContentTooShortError, self).__init__(
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
767
768
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class, applying the handler's source_address option.

    Used through functools.partial() by YoutubeDLHandler / YoutubeDLHTTPSHandler.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No native source_address support: replace connect() with a
            # version that binds the socket manually
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
794
795
def handle_youtubedl_headers(headers):
    """Translate internal youtube-dl pseudo headers into real ones.

    When 'Youtubedl-no-compression' is present it is removed together with
    any Accept-Encoding header (in a copy); otherwise the input mapping is
    returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict(
        (key, value) for key, value in headers.items()
        if key.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
804
805
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        # Route through a SOCKS proxy if requested via the internal
        # 'Ytdl-socks-proxy' pseudo header
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        # Handle both raw deflate streams and zlib-wrapped ones
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older addinfourl versions do not take a code argument
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry while trimming up to 1023 trailing bytes
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS requests/responses are processed identically
    https_request = http_request
    https_response = http_response
935
936
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class tunnelling through the given SOCKS proxy.

    base_class must be HTTPConnection or HTTPSConnection; socks_proxy is a
    socks/socks4/socks4a/socks5 URL, optionally with credentials and port.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    # NOTE(review): any other scheme leaves socks_type unbound and raises
    # NameError below — presumably callers pre-validate the scheme; confirm.

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, additionally wrap the tunnelled socket in TLS
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
978
979
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS counterpart of YoutubeDLHandler (SSL context + SOCKS support)."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        # Propagate the SSL context/hostname checking configured by
        # make_HTTPS_handler(), when the running Python supports them
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # Route through a SOCKS proxy if requested via the internal header
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)
1003
1004
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor that also applies cookie handling to HTTPS traffic."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # The workaround below is currently disabled; kept for reference:
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1027
1028
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (utc_offset, remainder): utc_offset is a datetime.timedelta,
    zero when there is no designator or it is a bare 'Z'.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if m:
        # Strip the designator whether or not it carries an offset.
        date_str = date_str[:-len(m.group('tz'))]
        if m.group('sign'):
            direction = 1 if m.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(m.group('hours')),
                minutes=direction * int(m.group('minutes')))
            return offset, date_str
    return datetime.timedelta(), date_str
1045
1046
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date, or None when unparsable. """
    if date_str is None:
        return None

    # Fractional seconds cannot be expressed in the strptime pattern below.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        parsed = datetime.datetime.strptime(
            date_str, '%Y-%m-%d{0}%H:%M:%S'.format(delimiter))
    except ValueError:
        return None
    return calendar.timegm((parsed - timezone).timetuple())
1064
1065
def date_formats(day_first=True):
    """Return the strptime patterns to try, ordered for the preferred
    day/month convention of ambiguous numeric dates."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1068
1069
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None

    # Commas are never significant, and AM/PM markers, timezone names and
    # explicit UTC offsets only get in the way of strptime.
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str.replace(',', ' '))
    _, date_str = extract_timezone(date_str)

    upload_date = None
    # NOTE: no break on success — the last format in the list that parses
    # wins, exactly as in the original implementation.
    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates.
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    return compat_str(upload_date) if upload_date is not None else None
1096
1097
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp for *date_str*, or None when unparsable.

    day_first selects DD/MM vs MM/DD interpretation of ambiguous dates.
    """
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    # A 'PM' marker means the parsed hour is 12 hours behind the wall clock.
    pm_delta = datetime.timedelta(hours=12 if re.search(r'(?i)PM', date_str) else 0)
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_delta
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # BUGFIX: parsedate_tz() returns a plain tuple, not a datetime, so it
        # has no timetuple() method — the original `timetuple.timetuple()`
        # raised AttributeError here. calendar.timegm() accepts the tuple
        # directly (it only reads the first six fields).
        return calendar.timegm(timetuple)
1119
1120
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from *url*; fall back to *default_ext*."""
    if url is None:
        return default_ext
    # Everything after the last '.' of the pre-query part of the URL.
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = candidate.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
1132
1133
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle filename: drop the media extension, then append
    '<lang>.<format>'."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
1136
1137
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # FIX: raw-string the pattern — '\d' is an invalid escape sequence in a
    # plain string literal (DeprecationWarning on modern Pythons).
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # Months/years have no fixed length; approximate with 30/365 days.
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword arguments ('days', 'weeks', ...).
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1165
1166
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    m = re.match(r'^(\d{4})(\d{2})(\d{2})$', date_str)
    # Anything that is not exactly eight digits is passed through untouched.
    return '-'.join(m.groups()) if m else date_str
1175
1176
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the widest representable interval.
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1206
1207
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Python 2 may hand back a byte string; normalise to text.
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
1216
1217
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map C file descriptors to Win32 standard-handle ids
    # (-11 = STD_OUTPUT_HANDLE, -12 = STD_ERROR_HANDLE).
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    # WriteConsoleW writes UTF-16 text straight to the console, bypassing
    # the byte-oriented (and potentially lossy) stdio encoding.
    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # Only a local character device on which GetConsoleMode succeeds
        # counts as a real console.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane,
        # or len(s) when there is none.
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 BMP characters per call; a non-BMP character is
        # written alone as two UTF-16 code units (count=0 -> pass 2 below).
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1291
1292
def write_string(s, out=None, encoding=None):
    """Write the text *s* to *out* (default: sys.stderr), coping with byte
    streams, Windows consoles and Python 2 quirks."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows, writing through the console API avoids mojibake.
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream backed by a binary buffer: encode explicitly so that
        # unencodable characters are dropped instead of raising.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1313
1314
def bytes_to_intlist(bs):
    """Turn a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: indexing bytes yields ints already
        return list(bs)
    return [ord(c) for c in bs]
1322
1323
def intlist_to_bytes(xs):
    """Inverse of bytes_to_intlist(): pack integer byte values into bytes."""
    return compat_struct_pack('%dB' % len(xs), *xs) if xs else b''
1328
1329
# Cross-platform file locking: define _lock_file/_unlock_file for the
# current platform (Win32 LockFileEx, POSIX fcntl, or a stub that raises).
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure used by Lock/UnlockFileEx.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range covering (practically) the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1403
1404
class locked_file(object):
    """File wrapper that holds an OS-level lock for the duration of a
    `with` block: shared for reads, exclusive for writes/appends."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Readers may share the lock; writers need it exclusively.
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1434
1435
def get_filesystem_encoding():
    """sys.getfilesystemencoding(), defaulting to UTF-8 when it is None."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1439
1440
def shell_quote(args):
    """Join *args* into a single shell-escaped command line string."""
    encoding = get_filesystem_encoding()

    def as_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(encoding) if isinstance(a, bytes) else a

    return ' '.join(pipes.quote(as_text(a)) for a in args)
1450
1451
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with anything already smuggled into the URL.
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    payload = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + payload
1460
1461
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(): return (clean_url, smuggled_data)."""
    if '#__youtubedl_smuggle' not in smug_url:
        # Nothing smuggled: hand back the caller-supplied default.
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    payload = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1469
1470
def format_bytes(bytes):
    """Render a byte count as a human readable string, e.g. '1.25MiB'."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # log base 1024 picks the binary-prefix bucket; 0 stays in plain bytes.
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1483
1484
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' at the start of *s* using *unit_table*
    (unit -> multiplier); return the value in base units, or None."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    # Accept ',' as a decimal separator as well.
    amount = float(m.group('num').replace(',', '.'))
    return int(amount * unit_table[m.group('unit')])
1494
1495
def parse_filesize(s):
    """Parse a human file size such as '5.6 MiB' into a byte count."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial, but we
    # support those too. Four variants per prefix letter:
    #   <L>iB -> binary, <L>B -> decimal, <l>B -> binary, <L>b -> decimal.
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
    }
    for power, letter in enumerate('KMGTPEZY', start=1):
        _UNIT_TABLE[letter + 'iB'] = 1024 ** power           # e.g. KiB
        _UNIT_TABLE[letter + 'B'] = 1000 ** power            # e.g. KB
        _UNIT_TABLE[letter.lower() + 'B'] = 1024 ** power    # e.g. kB
        _UNIT_TABLE[letter + 'b'] = 1000 ** power            # e.g. Kb

    return lookup_unit_table(_UNIT_TABLE, s)
1540
1541
def parse_count(s):
    """Parse a view/like count such as '1.2M' into an int."""
    if s is None:
        return None

    s = s.strip()

    # Plain (possibly grouped) digits need no unit handling.
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {}
    for units, mult in ((('k', 'K'), 1000), (('m', 'M', 'kk', 'KK'), 1000 ** 2)):
        for unit in units:
            _UNIT_TABLE[unit] = mult

    return lookup_unit_table(_UNIT_TABLE, s)
1561
1562
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1570
1571
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [s[:3] for s in ENGLISH_MONTH_NAMES]
    if abbrev in abbreviations:
        return abbreviations.index(abbrev) + 1
    return None
1580
1581
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # A '&' is left alone only when it already starts a recognised entity
    # (named, hexadecimal or decimal character reference).
    bare_amp = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_amp, '&amp;', xml_str)
1588
1589
def setproctitle(title):
    """Best-effort: rename the current process to *title* via prctl(2).

    Silently does nothing on platforms without a loadable libc.so.6 or a
    prctl symbol (Jython, non-glibc systems).
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 is PR_SET_NAME (linux/prctl.h).
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1609
1610
def remove_start(s, start):
    """Strip *start* from the beginning of *s*, when present."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
1613
1614
def remove_end(s, end):
    """Strip *end* from the end of *s*, when present."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
1617
1618
def remove_quotes(s):
    """Strip one pair of matching single or double quotes around *s*."""
    if s is None or len(s) < 2:
        return s
    # Both ends must carry the *same* quote character.
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1626
1627
def url_basename(url):
    """Return the last path component of *url* (query/fragment excluded)."""
    return compat_urlparse.urlparse(url).path.strip('/').split('/')[-1]
1631
1632
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request subclass that always issues the HTTP HEAD verb."""
    def get_method(self):
        return 'HEAD'
1636
1637
class PUTRequest(compat_urllib_request.Request):
    """A urllib Request subclass that always issues the HTTP PUT verb."""
    def get_method(self):
        return 'PUT'
1641
1642
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """int(v) * invscale // scale, or *default* when conversion fails.

    With get_attr, v is first replaced by getattr(v, get_attr, None).
    None and the empty string yield *default*.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # FIX: also catch TypeError — int() raises it (not ValueError) for
        # non-numeric objects such as lists or dicts.
        return default
1655
1656
def str_or_none(v, default=None):
    """compat_str(v), or *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
1659
1660
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop digit separators and a leading '+'.
    return int(re.sub(r'[,\.\+]', '', int_str))
1667
1668
def float_or_none(v, scale=1, invscale=1, default=None):
    """float(v) * invscale / scale, or *default* when conversion fails."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # FIX: also catch TypeError — float() raises it (not ValueError) for
        # non-numeric objects such as lists or dicts.
        return default
1676
1677
def strip_or_none(v):
    """v.strip(), passing None through untouched."""
    if v is None:
        return None
    return v.strip()
1680
1681
def parse_duration(s):
    """Parse a duration string ('1:23:45', '2h 3min', 'PT1H2M3S', '2.5 hours',
    ...) into a number of seconds, or return None when unparsable."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    # First try colon-separated [[[DD:]HH:]MM:]SS[.ms] notation.
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # Then unit-suffixed notation, which also covers ISO 8601 style
        # 'PT#H#M#S' durations (the 'P'/'T' prefix is optional here).
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # Finally fractional '#.# hours' / '#.# minutes' forms.
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    # Sum whichever components matched (empty/None groups contribute nothing).
    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
1728
1729
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension: 'a.mp4' -> 'a.<ext>.mp4'.

    When expected_real_ext is given and does not match the actual extension,
    *ext* is appended after the whole filename instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1736
1737
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with *ext*; when expected_real_ext is
    given and does not match, append instead of replacing."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = name
    return '{0}.{1}'.format(base, ext)
1743
1744
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: the mutable default is safe here — args is never modified.
    try:
        # Spawn it once, discarding output; failure to spawn means missing.
        subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
1753
1754
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        proc = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = proc.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1768
1769
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version token from program *output* via *version_re*
    (default: 'version <token>'); return *unrecognized* when absent."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1779
1780
class PagedList(object):
    """Abstract base for lazily-paged lists; subclasses implement
    getslice(start, end)."""
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1785
1786
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages lazily via pagefunc(pagenum), with an
    optional per-page result cache."""
    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        # Collect items with absolute indices [start, end) by walking pages
        # from the one containing `start`.
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of the requested slice within this page (first page only).
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Exclusive end offset within this page, when `end` lands here.
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1837
1838
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        first_page = start // self._pagesize
        last_page = self._pagecount if end is None else end // self._pagesize + 1
        # Elements to drop from the first fetched page, and how many items
        # the caller still wants in total (None = all of them).
        skip_elems = start - first_page * self._pagesize
        remaining = None if end is None else end - start
        for pagenum in range(first_page, last_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if remaining is not None:
                if len(page) >= remaining:
                    res.extend(page[:remaining])
                    break
                remaining -= len(page)
            res.extend(page)
        return res
1866
1867
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences in *s* into characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0], s)
1874
1875
def lowercase_escape(s):
    """Decode literal '\\uXXXX' escape sequences in *s* into characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0], s)
1882
1883
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        # Python 2's quote() expects a byte string.
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1889
1890
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    return parts._replace(
        # The host gets IDNA-encoded rather than percent-escaped.
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    ).geturl()
1901
1902
def read_batch_urls(batch_fd):
    """Read a batch-file object and return its list of URLs, skipping blank
    lines and lines starting with '#', ';' or ']'."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM may survive decoding as these three characters.
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # Comment lines map to False and are filtered out below.
        return False if url.startswith(('#', ';', ']')) else url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1917
1918
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes, as expected by
    urllib request objects."""
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
1921
1922
def update_url_query(url, query):
    """Return *url* with the key/value pairs of *query* merged into its query
    string (existing keys are overwritten)."""
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    updated = parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))
    return compat_urlparse.urlunparse(updated)
1931
1932
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Clone *req*, optionally overriding its URL, body, headers or query.

    NOTE: the mutable defaults are safe here — they are never modified.
    """
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    # Preserve the original HTTP verb for non-GET/POST requests.
    method = req.get_method()
    if method == 'HEAD':
        req_type = HEADRequest
    elif method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
1951
1952
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """d[key] for the first usable key in *key_or_keys*.

    A value is skipped when missing or None, and (unless skip_false_values
    is False) also when falsy ('' / 0 / empty container).
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        value = d.get(key)
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1961
1962
def try_get(src, getter, expected_type=None):
    """getter(src), or None when it raises a lookup-type error or the result
    is not an instance of *expected_type*."""
    try:
        v = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is None or isinstance(v, expected_type):
        return v
    return None
1971
1972
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Coerce *string* to compat_str, decoding byte strings with *encoding*.

    NOTE: the default *encoding* is evaluated once, at import time.
    """
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
1975
1976
# US content ratings mapped to the minimum viewer age used internally.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Parse an age limit such as '18', '16+' or a US rating like 'PG-13'."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    # Fall back to the rating table; unknown strings yield None.
    return US_RATINGS.get(s)
1991
1992
def strip_jsonp(code):
    """Strip a JSONP wrapper ('callback({...});// ...') down to the payload."""
    wrapper = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(wrapper, r'\1', code)
1996
1997
def js_to_json(code):
    """Convert a JavaScript object/value literal into valid JSON text."""
    def fix_kv(m):
        token = m.group(0)
        # These literals are shared between JavaScript and JSON.
        if token in ('true', 'false', 'null'):
            return token
        # Comments and trailing commas simply disappear.
        if token.startswith('/*') or token == ',':
            return ''

        if token[0] in ("'", '"'):
            # Re-escape the string body for JSON double-quoting.
            token = re.sub(r'(?s)\\.|"', lambda sm: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(sm.group(0), sm.group(0)), token[1:-1])

        # Hex/octal integers become decimal; a trailing ':' marks an object
        # key, which JSON requires to be a quoted string.
        for regex, base in ((r'^0[xX][0-9a-fA-F]+', 16), (r'^0+[0-7]+', 8)):
            im = re.match(regex, token)
            if im:
                value = int(im.group(0), base)
                return '"%d":' % value if token.endswith(':') else '%d' % value

        return '"%s"' % token

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
2035
2036
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality score; unknown ids rank lowest.
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2045
2046
2047 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2048
2049
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    # Keep room for the ellipses inside the length budget.
    return s[:length - len(ELLIPSES)] + ELLIPSES
2058
2059
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
2062
2063
def is_outdated_version(version, limit, assume_new=True):
    """True when *version* is older than *limit*; missing or unparsable
    versions are judged by *assume_new*."""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        # Non-numeric components make the comparison impossible.
        return fallback
2071
2072
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Only the zipball and frozen (exe) distributions are self-updatable.
    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
2078
2079
def args_to_str(args):
    """Shell-quote and join *args* for display purposes."""
    # Get a short string representation for a subprocess command
    return ' '.join(compat_shlex_quote(a) for a in args)
2083
2084
def error_to_compat_str(err):
    """str(err) that is safe on Python 2 for non-ASCII exception messages."""
    err_str = str(err)
    if sys.version_info[0] < 3:
        # On python 2 error byte string must be decoded with proper
        # encoding rather than ascii
        err_str = err_str.decode(preferredencoding())
    return err_str
2092
2093
def mimetype2ext(mt):
    """Map a MIME type to a conventional file extension."""
    if mt is None:
        return None

    # Full-type overrides come first. Per RFC 3003, audio/mpeg can be .mp1,
    # .mp2 or .mp3; we use .mp3 as it's the most popular one.
    FULL_TYPE_MAP = {
        'audio/mp4': 'm4a',
        'audio/mpeg': 'mp3',
    }
    ext = FULL_TYPE_MAP.get(mt)
    if ext is not None:
        return ext

    # Otherwise map the (lower-cased) subtype; unknown subtypes pass through.
    subtype = mt.rpartition('/')[2].lower()
    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
    }
    return SUBTYPE_MAP.get(subtype, subtype)
2129
2130
def parse_codecs(codecs_str):
    """Parse an RFC 6381 codecs string into {'vcodec': ..., 'acodec': ...}.

    'none' marks a stream kind known to be absent; unrecognised codec
    strings are now reported as-is instead of being dropped.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = list(filter(None, map(
        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        if len(splited_codecs) == 2:
            # FIX: two unrecognised entries — report them in "video, audio"
            # order. The original returned {'vcodec': None, 'acodec': None}
            # since neither variable was ever assigned.
            return {
                'vcodec': splited_codecs[0],
                'acodec': splited_codecs[1],
            }
        elif len(splited_codecs) == 1:
            # FIX: a single unrecognised codec is assumed audio-only; the
            # original returned None here (vcodec was never set).
            return {
                'vcodec': 'none',
                'acodec': splited_codecs[0],
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
2165
2166
def urlhandle_detect_ext(url_handle):
    """Guess the file extension for an opened URL handle.

    Prefers the filename given in the Content-Disposition header and
    falls back to mapping the Content-Type MIME type.
    """
    headers = url_handle.headers

    content_disposition = headers.get('Content-Disposition')
    if content_disposition:
        mobj = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"',
            content_disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
2179
2180
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI embedding `data` (bytes) as base64."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, payload)
2183
2184
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # Either no viewer age limit was set, or the content carries no
        # restriction at all — never block in those cases
        return False
    return age_limit < content_limit
2193
2194
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Pick the decoding from a recognized byte-order mark, falling back
    # to UTF-8 when none is present
    BOMS = (
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    )
    encoding, bom_len = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, bom_len = enc, len(bom)
            break

    text = first_bytes[bom_len:].decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
2213
2214
def determine_protocol(info_dict):
    """Infer the download protocol for a format dict.

    An explicit 'protocol' entry wins; otherwise the URL prefix, then the
    file extension, then the URL scheme decide.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2235
2236
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell per column determines that column's width
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Left-align every column but the last, padded to width + 1
    template = ' '.join('%-' + compat_str(width + 1) + 's' for width in widths[:-1]) + '%s'
    return '\n'.join(template % tuple(row) for row in rows)
2243
2244
2245 def _match_one(filter_part, dct):
2246 COMPARISON_OPERATORS = {
2247 '<': operator.lt,
2248 '<=': operator.le,
2249 '>': operator.gt,
2250 '>=': operator.ge,
2251 '=': operator.eq,
2252 '!=': operator.ne,
2253 }
2254 operator_rex = re.compile(r'''(?x)\s*
2255 (?P<key>[a-z_]+)
2256 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2257 (?:
2258 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2259 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2260 )
2261 \s*$
2262 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2263 m = operator_rex.search(filter_part)
2264 if m:
2265 op = COMPARISON_OPERATORS[m.group('op')]
2266 if m.group('strval') is not None:
2267 if m.group('op') not in ('=', '!='):
2268 raise ValueError(
2269 'Operator %s does not support string values!' % m.group('op'))
2270 comparison_value = m.group('strval')
2271 else:
2272 try:
2273 comparison_value = int(m.group('intval'))
2274 except ValueError:
2275 comparison_value = parse_filesize(m.group('intval'))
2276 if comparison_value is None:
2277 comparison_value = parse_filesize(m.group('intval') + 'B')
2278 if comparison_value is None:
2279 raise ValueError(
2280 'Invalid integer value %r in filter part %r' % (
2281 m.group('intval'), filter_part))
2282 actual_value = dct.get(m.group('key'))
2283 if actual_value is None:
2284 return m.group('none_inclusive')
2285 return op(actual_value, comparison_value)
2286
2287 UNARY_OPERATORS = {
2288 '': lambda v: v is not None,
2289 '!': lambda v: v is None,
2290 }
2291 operator_rex = re.compile(r'''(?x)\s*
2292 (?P<op>%s)\s*(?P<key>[a-z_]+)
2293 \s*$
2294 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2295 m = operator_rex.search(filter_part)
2296 if m:
2297 op = UNARY_OPERATORS[m.group('op')]
2298 actual_value = dct.get(m.group('key'))
2299 return op(actual_value)
2300
2301 raise ValueError('Invalid filter part %r' % filter_part)
2302
2303
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&' joins sub-expressions; all of them must hold
    parts = filter_str.split('&')
    return all(_match_one(part, dct) for part in parts)
2309
2310
def match_filter_func(filter_str):
    """Build a match-filter callback: it returns None when the video passes
    `filter_str`, or a human-readable skip reason otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2319
2320
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP/TTML time expression to seconds (float).

    Returns None for empty or unrecognized expressions.
    """
    if not time_expr:
        return

    # Offset form: "12.345" or "12.345s"
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    # Clock form: "HH:MM:SS(.fff)"; some documents separate the fraction
    # with ':' instead of '.'
    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        hours, minutes, seconds = mobj.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
2332
2333
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode: HH:MM:SS,mmm."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %02d/%03d truncate the float parts to integers while zero-padding
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2336
2337
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup (a text string) to SRT.

    Raises ValueError when the document contains no <p> cue elements.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # XMLParser target that flattens a <p> element into plain text,
        # turning <br/> tags (in any known namespace) into newlines
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Re-serialize the element and feed it through the text-extracting
        # parser target above
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    # Cue paragraphs may live in any of the known TTML namespaces, or none
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        # Cues without a usable start time are dropped; a missing end time
        # is derived from the duration when available
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2391
2392
def cli_option(params, command_option, param):
    """Return [command_option, value] when `param` is set in `params`,
    otherwise an empty list."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2396
2397
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as command-line arguments.

    With `separator`, produces a single joined token; otherwise the
    option and its value as two tokens. The param must be present and
    boolean.
    """
    value = params.get(param)
    assert isinstance(value, bool)
    rendered = true_value if value else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
2404
2405
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when `param` equals `expected_value`,
    otherwise an empty list."""
    return [command_option] if params.get(param) == expected_value else []
2409
2410
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra CLI args stored under `param`, or `default`
    when the key is absent. The stored value must be a list."""
    extra_args = params.get(param)
    if extra_args is None:
        return default
    assert isinstance(extra_args, list)
    return extra_args
2417
2418
class ISO639Utils(object):
    """Convert between two-letter (ISO 639-1) and three-letter (ISO 639-2/T)
    language codes using a static lookup table."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Maps ISO 639-1 (2-letter) -> ISO 639-2/T (3-letter)
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T.

        Only the first two characters of `code` are considered; returns
        None for unknown codes.
        """
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1.

        Returns None (implicitly) when the code is not in the table.
        """
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
2619
2620
class ISO3166Utils(object):
    """Map ISO 3166-1 alpha-2 country codes to English country names."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-1 alpha-2 country code to the corresponding
        full English name (None when unknown); lookup is case-insensitive."""
        return cls._country_map.get(code.upper())
2879
2880
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler variant that lets individual requests override the
    proxy via a 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    # The keyword defaults bind the current loop values
                    # (late-binding closure workaround)
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy from the Ytdl-request-proxy header takes
        # precedence over the handler-wide setting; the internal header is
        # removed before the request goes out
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2904
2905
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The payload bytes are interpreted as a little-endian integer
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
2921
2922
def encode_base_n(num, n, table=None):
    """Encode a non-negative integer in base `n`, using `table` as the
    digit alphabet (defaults to 0-9a-zA-Z truncated to `n` symbols)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    digits = []
    while num:
        num, rem = divmod(num, n)
        digits.append(table[rem])
    return ''.join(reversed(digits))
2939
2940
def decode_packed_codes(code):
    """Decode JavaScript obfuscated with the Dean Edwards p.a.c.k.e.r
    scheme by substituting each encoded token with its original symbol."""
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Map each base-n token back to its symbol; empty table entries keep
    # the encoded token itself
    symbol_table = {}
    for idx in range(count - 1, -1, -1):
        token = encode_base_n(idx, base)
        symbol_table[token] = symbols[idx] or token

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
2959
2960
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list (RFC 8216 section 4.2), e.g.
    'BANDWIDTH=1000,CODECS="mp4a.40.2"', into a dict.

    Surrounding double quotes are stripped from quoted values.
    """
    info = {}
    # "[^"]*" (not +) so that empty quoted values such as URI="" are
    # accepted instead of derailing the parse of the rest of the line
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]*"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
            val = val[1:-1]
        info[key] = val
    return info
2968
2969
def urshift(val, n):
    """Unsigned (logical) right shift of a 32-bit value, like JavaScript's
    `>>>` operator."""
    if val >= 0:
        return val >> n
    # Reinterpret the negative value as its unsigned 32-bit equivalent
    return (val + 0x100000000) >> n