]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[utils] add mimetypes to determine manifest ext(m3u8, f4m, mpd)
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import errno
15 import functools
16 import gzip
17 import io
18 import itertools
19 import json
20 import locale
21 import math
22 import operator
23 import os
24 import pipes
25 import platform
26 import re
27 import socket
28 import ssl
29 import subprocess
30 import sys
31 import tempfile
32 import traceback
33 import xml.etree.ElementTree
34 import zlib
35
36 from .compat import (
37 compat_HTMLParser,
38 compat_basestring,
39 compat_chr,
40 compat_etree_fromstring,
41 compat_html_entities,
42 compat_html_entities_html5,
43 compat_http_client,
44 compat_kwargs,
45 compat_parse_qs,
46 compat_shlex_quote,
47 compat_socket_create_connection,
48 compat_str,
49 compat_struct_pack,
50 compat_urllib_error,
51 compat_urllib_parse,
52 compat_urllib_parse_urlencode,
53 compat_urllib_parse_urlparse,
54 compat_urllib_parse_unquote_plus,
55 compat_urllib_request,
56 compat_urlparse,
57 compat_xpath,
58 )
59
60 from .socks import (
61 ProxyType,
62 sockssocket,
63 )
64
65
def register_socks_protocols():
    """Make urlparse treat SOCKS schemes as carrying a netloc.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose protocol is not listed in
    urlparse.uses_netloc are not handled correctly, so the SOCKS schemes are
    "registered" here.
    """
    missing = [scheme for scheme in ('socks', 'socks4', 'socks4a', 'socks5')
               if scheme not in compat_urlparse.uses_netloc]
    compat_urlparse.uses_netloc.extend(missing)
73
74
# Type of a compiled regular expression object; this is not clearly
# defined (exposed) otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers added to every request (see YoutubeDLHandler.http_request)
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel meaning "no default was supplied", so that None remains a usable
# default value (see xpath_element() and friends)
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# File extensions recognized as media/manifest files
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# Accented char -> ASCII replacement table; needed for sanitizing filenames
# in restricted mode (see sanitize_filename)
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

# strptime() patterns tried, in order, when parsing free-form dates
# (see date_formats()/unified_strdate())
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%b %d %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
)

# Extra patterns for locales that write the day first (e.g. 01/02/2003 = 1 Feb)
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# Extra patterns for locales that write the month first (e.g. 01/02/2003 = 2 Jan)
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
155
156
def preferredencoding():
    """Return the best guess at the system's preferred text encoding.

    Based on locale.getpreferredencoding(); falls back to 'UTF-8' when
    the lookup fails or the reported codec cannot actually encode text.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Sanity check: make sure the codec is real and usable
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
170
171
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible.

    The data is first written to a temp file in the same directory, which
    is then renamed over fn so readers never observe a half-written file.
    On Windows the existing file is unlinked first, so the replacement is
    not fully atomic there.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        # Keep the temp file next to the target so os.rename below stays
        # on the same filesystem
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best effort cleanup of the temp file; re-raise the original error
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
224
225
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] (or xpath[@key] when val is None) """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            expr = '%s[@%s]' % (xpath, key)
        else:
            expr = "%s[@%s='%s']" % (xpath, key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] (manual scan; 2.6 lacks predicates) """
        for candidate in node.findall(compat_xpath(xpath)):
            if key not in candidate.attrib:
                continue
            if val is None or candidate.attrib.get(key) == val:
                return candidate
        return None
240
241 # On python2.6 the xml.etree.ElementTree.Element methods don't support
242 # the namespace parameter
243
244
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of *path* into '{uri}tag' using *ns_map*.

    Steps without a prefix are kept as-is.
    """
    def expand(component):
        parts = component.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(c) for c in path.split('/'))
255
256
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching *xpath* (a string or list of strings).

    Returns *default* when supplied and nothing matched; raises
    ExtractorError when fatal and nothing matched; otherwise returns None.
    """
    def _find(xp):
        return node.find(compat_xpath(xp))

    if isinstance(xpath, (str, compat_str)):
        n = _find(xpath)
    else:
        # Try each alternative xpath until one matches
        for xp in xpath:
            n = _find(xp)
            if n is not None:
                break

    if n is not None:
        return n
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element %s' % (xpath if name is None else name))
    return None
278
279
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text content."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is not None:
        return n.text
    # Element found but it has no text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element\'s text %s' % (xpath if name is None else name))
    return None
293
294
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute *key* of the first element matching xpath[@key]."""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
306
307
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals *id* in *html*."""
    return get_element_by_attribute('id', id, html)
311
312
def get_element_by_attribute(attribute, value, html):
    """Return the (unescaped) content of the first tag in *html* carrying
    the given attribute/value pair, or None when nothing matches."""
    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if m is None:
        return None
    content = m.group('content')

    # Strip a surrounding quote pair, if present
    if content[:1] in ('"', "'"):
        content = content[1:-1]

    return unescapeHTML(content)
334
335
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser that records the attributes of a single element.

    After feeding it one element, the attribute dict is available as .attrs.
    """

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.attrs = {}

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
344
345
def extract_attributes(html_element):
    """Parse a single HTML element string into a dict of its attributes.

    For example
        <el a="foo" B="bar" c="&98;az" d=boz
            empty= noval entity="&amp;" sq='"' dq="'">
    yields
        {'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
         'empty': '', 'noval': None, 'entity': '&',
         'sq': '"', 'dq': '\''}

    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    parser.close()
    return parser.attrs
366
367
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    if html is None:  # Convenience for sanitizing descriptions etc.
        return None

    # Turn <br> / </p><p> boundaries into newlines, dropping literal newlines
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Drop any remaining tags, then decode entities
    html = re.sub('<.*?>', '', html)
    html = unescapeHTML(html)
    return html.strip()
383
384
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                # Switch stdout to binary mode so media bytes are not
                # mangled by CRLF translation
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission error will not be fixed by renaming - give up
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
415
416
def timeconvert(timestr):
    """Convert an RFC 2822 date string into a UNIX timestamp (None on failure)."""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
424
425
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Keep timestamps readable: 12:34:56 -> 12_34_56, before per-char mapping
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(c) for c in s)
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[1:]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
464
465
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows (no-op on other platforms)."""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    parts = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        parts.pop(0)

    def _clean(part):
        # '.' and '..' are structural and must survive untouched
        if part in ('.', '..'):
            return part
        return re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', part)

    sanitized = [_clean(part) for part in parts]
    if drive_or_unc:
        sanitized.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized)
482
483
def sanitize_url(url):
    """Prepend protocol-less URLs ('//host/...') with an `http:` scheme to
    mitigate the number of unwanted failures due to a missing protocol."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
488
489
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request with the URL passed through sanitize_url()."""
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
492
493
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first-seen order.

    Membership is tested against the result list itself, so elements do
    not need to be hashable.
    """
    res = []
    for el in iterable:
        if el in res:
            continue
        res.append(el)
    return res
501
502
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity (without the leading '&') to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base, numstr = 16, '0%s' % numstr
        else:
            base = 10
        # Out-of-range code points raise ValueError; fall through to the
        # literal representation (https://github.com/rg3/youtube-dl/issues/7518)
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
532
533
def unescapeHTML(s):
    """Replace HTML entities in *s* with the characters they represent."""
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
541
542
def get_subprocess_encoding():
    """Return the encoding used for subprocess arguments and output."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    return sys.getfilesystemencoding() or 'utf-8'
553
554
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file (must be a text string)
    """
    assert type(s) == compat_str

    if sys.version_info >= (3, 0):
        # Python 3 has a Unicode API
        return s

    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    if sys.platform.startswith('java'):
        # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
577
578
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename(): decode a byte filename back to text.

    Returned untouched on Python 3 or when already a text string.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
588
589
def encodeArgument(s):
    """Encode a command-line argument for passing to a subprocess."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
597
598
def decodeArgument(b):
    """Decode a subprocess argument back to text (see decodeFilename)."""
    return decodeFilename(b, True)
601
602
def decodeOption(optval):
    """Decode a command-line option value to text; None passes through."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
611
612
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    Boundary fix: exact multiples now roll over consistently — 3600 renders
    as '1:00:00' (previously '60:00') and 60 as '1:00' (previously '60'),
    matching how 3601 / 61 were already formatted.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
620
621
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honoring the 'nocheckcertificate' option.

    Probes the ssl module for the best API the running interpreter offers
    and configures certificate verification accordingly.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # No SSLContext support at all: plain handler
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        # Hand-built context with optional certificate verification
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
645
646
def bug_reports_message():
    """Build the standard "please report this issue" suffix appended to
    unexpected error messages."""
    if ytdl_is_updateable():
        update_cmd = 'type  youtube-dl -U  to update'
    else:
        update_cmd = 'see  https://yt-dl.org/update  on how to update'
    return (
        '; please report this issue on https://yt-dl.org/bug .'
        ' Make sure you are using the latest version; %s.'
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        % update_cmd)
656
657
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network/unavailability errors are always "expected" — they are not
        # bugs, so the bug-report boilerplate below is skipped for them
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        # Render the stored traceback as a string, or None when none was given
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
685
686
class UnsupportedError(ExtractorError):
    """Raised when no extractor is able to handle the given URL."""

    def __init__(self, url):
        self.url = url
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
692
693
class RegexNotFoundError(ExtractorError):
    """Raised when a regex used for extraction fails to match."""
    pass
697
698
class DownloadError(Exception):
    """Download Error exception.

    Thrown by FileDownloader objects when they are not configured to
    continue on errors; carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Original sys.exc_info() triple, when the caller supplied one
        self.exc_info = exc_info
711
712
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
    pass
720
721
class PostProcessingError(Exception):
    """Post Processing exception.

    Raised by a PostProcessor's .run() method to signal an error in the
    postprocessing task; the message is stored on the .msg attribute.
    """

    def __init__(self, msg):
        self.msg = msg
731
732
class MaxDownloadsReached(Exception):
    """ Raised once the --max-downloads limit has been reached. """
    pass
736
737
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
    pass
745
746
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller
    than the size the server announced, indicating the connection was
    probably interrupted. Both sizes are stored in bytes.
    """

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
759
760
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an HTTP(S) connection, applying the handler's source_address
    option and the Python 2 workarounds below."""
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address attribute: replace connect() with a version
            # that binds the outgoing socket to the requested address itself
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
786
787
def handle_youtubedl_headers(headers):
    """Strip youtube-dl's internal marker headers before a real request.

    A 'Youtubedl-no-compression' marker removes any Accept-Encoding header
    (case-insensitively) along with the marker itself; otherwise *headers*
    is returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict(
        (k, v) for k, v in headers.items() if k.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
796
797
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open a plain HTTP connection, honoring a per-request SOCKS proxy
        passed via the private 'Ytdl-socks-proxy' marker header."""
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        # Try raw deflate first, then zlib-wrapped deflate — servers send both
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older addinfourl has no code argument/getcode(); set .code manually
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Transparently decompress gzip/deflate bodies and re-escape
        redirect targets."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
927
928
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class from *base_class* whose connect() tunnels
    through the SOCKS proxy described by the *socks_proxy* URL
    (socks/socks4/socks4a/socks5 schemes)."""
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    # NOTE(review): any other scheme leaves socks_type unbound and raises
    # NameError when proxy_args is built below — presumably callers only pass
    # validated schemes; confirm

    def unquote_if_non_empty(s):
        # Percent-decode proxy credentials; leave None/'' untouched
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            # Replace the plain socket with a SOCKS-aware one
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, additionally wrap the tunnelled socket in TLS
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
970
971
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler supporting a custom connection class and per-request
    SOCKS proxies.

    https_conn_class, when given, replaces the default HTTPSConnection.
    """

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        # Forward the handler's SSL context / hostname checking when the
        # running Python exposes them
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # Per-request SOCKS proxy, passed via a private marker header
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)
995
996
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that applies the cookie handlers to HTTPS traffic too."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE(review): the workaround below is deliberately disabled; kept
        # commented out for reference.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1019
1020
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (utc_offset, remaining_date_str); the offset is a
    datetime.timedelta, zero when no explicit numeric offset is present.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if m is None:
        return datetime.timedelta(), date_str
    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # A bare 'Z' designator: UTC, i.e. zero offset.
        return datetime.timedelta(), date_str
    direction = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, date_str
1037
1038
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date, or None on failure. """
    if date_str is None:
        return None

    # Fractional seconds cannot be expressed in the strptime format below.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    fmt = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        parsed = datetime.datetime.strptime(date_str, fmt) - timezone
    except ValueError:
        return None
    return calendar.timegm(parsed.timetuple())
1056
1057
def date_formats(day_first=True):
    """Return the strptime patterns to try, day-first or month-first."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1060
1061
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None

    # Commas, AM/PM markers and timezones only get in strptime's way.
    date_str = date_str.replace(',', ' ')
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    upload_date = None
    # Deliberately no break: a later, more specific format wins.
    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Fall back on the RFC 2822 parser from the email package.
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
1088
1089
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp for the given date string, or None.

    Tries the known explicit formats first (honouring any trailing
    timezone and a 12-hour "PM" marker), then falls back on the RFC 2822
    parser from the email package.
    """
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    # An explicit "PM" shifts the parsed hour by 12 (formats use %H).
    pm_delta = datetime.timedelta(hours=12 if re.search(r'(?i)PM', date_str) else 0)
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_delta
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # BUG FIX: parsedate_tz() returns a plain 10-tuple, which has no
        # timetuple() method (the old code raised AttributeError here);
        # calendar.timegm() accepts the tuple directly.
        return calendar.timegm(timetuple)
1111
1112
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL's path, or return default_ext."""
    if url is None:
        return default_ext
    # Drop the query string, then take whatever follows the last dot.
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = guess.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1124
1125
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
1128
1129
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # Months and years are approximated as 30 and 365 days respectively.
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    return today + datetime.timedelta(**{unit + 's': amount})
1157
1158
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    m = re.match(r'^(\d{4})(\d\d)(\d\d)$', date_str)
    # Anything that is not exactly eight digits is passed through untouched.
    return '-'.join(m.groups()) if m else date_str
1167
1168
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing boundaries default to the extreme representable dates.
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1198
1199
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Python 2 may hand back a byte string; normalize to text.
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
1208
1209
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    # Writing to a real Windows console goes through WriteConsoleW so that
    # characters outside the console codepage come out correctly.

    import ctypes
    import ctypes.wintypes

    # C-level file descriptor -> GetStdHandle id
    # (STD_OUTPUT_HANDLE = -11, STD_ERROR_HANDLE = -12).
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle counts as a console only if it is a local character
        # device and GetConsoleMode() succeeds on it.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        # (i.e. needing a surrogate pair), or len(s) if there is none.
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write runs of up to 1024 BMP characters; a non-BMP character is
        # written on its own as a two-unit surrogate pair (count == 0 case).
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1283
1284
def write_string(s, out=None, encoding=None):
    """Write the text *s* to *out* (default: sys.stderr), coping with
    consoles and streams that cannot take unicode text directly."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows a real console needs WriteConsoleW for correct output.
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode ourselves so
        # we control the encoding and the error handling ('ignore').
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
1305
1306
def bytes_to_intlist(bs):
    """Turn a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Python 3 yields ints when indexing bytes; Python 2 yields
    # one-character strings that need ord().
    return list(bs) if isinstance(bs[0], int) else [ord(c) for c in bs]
1314
1315
def intlist_to_bytes(xs):
    """Pack a list of integer byte values back into a byte string."""
    if not xs:
        return b''
    # struct with a '<n>B' format packs each int as one unsigned byte.
    return compat_struct_pack('%dB' % len(xs), *xs)
1320
1321
# Cross-platform file locking: define _lock_file()/_unlock_file() using
# whatever primitive the current platform provides.
if sys.platform == 'win32':
    # Windows: advisory locks via the Win32 LockFileEx/UnlockFileEx API,
    # called through ctypes.
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Win32 OVERLAPPED structure; only the offset fields matter here.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range covering the entire file (low/high halves of the length).
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # Take an exclusive (0x2) or shared (0x0) lock on the whole file.
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # The OVERLAPPED pointer must stay alive until unlock; stash it
        # on the file object.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Release the lock taken by _lock_file (same OVERLAPPED pointer).
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1395
1396
class locked_file(object):
    """File wrapper that holds an advisory lock for the life of a
    with-block (exclusive for writers, shared for readers)."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Never leak the file handle when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1426
1427
def get_filesystem_encoding():
    """Return the file system encoding, defaulting to UTF-8 when unknown."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1431
1432
def shell_quote(args):
    """Quote a list of arguments for display as a shell command line."""
    encoding = get_filesystem_encoding()

    def as_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(encoding) if isinstance(a, bytes) else a

    return ' '.join(pipes.quote(as_text(a)) for a in args)
1442
1443
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any data already smuggled into the URL.
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    fragment = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, fragment)
1452
1453
def unsmuggle_url(smug_url, default=None):
    """Extract data previously embedded by smuggle_url()."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    payload = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1461
1462
def format_bytes(bytes):
    """Format a byte count as a human-readable string (e.g. '1.00MiB')."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1475
1476
def lookup_unit_table(unit_table, s):
    """Parse a '<number><unit>' string against *unit_table*, or None."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    # Accept ',' as a decimal separator too.
    amount = float(m.group('num').replace(',', '.'))
    return int(amount * unit_table[m.group('unit')])
1486
1487
def parse_filesize(s):
    """Parse a human file size like '5.6 MiB' into a byte count, or None."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too.  For each SI prefix the table contains:
    #   'KiB' -> 1024**n (binary),   'KB' -> 1000**n (decimal),
    #   'kB'  -> 1024**n (common misuse), 'Kb' -> 1000**n.
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
    }
    for exponent, prefix in enumerate('KMGTPEZY', start=1):
        _UNIT_TABLE[prefix + 'iB'] = 1024 ** exponent
        _UNIT_TABLE[prefix + 'B'] = 1000 ** exponent
        _UNIT_TABLE[prefix.lower() + 'B'] = 1024 ** exponent
        _UNIT_TABLE[prefix + 'b'] = 1000 ** exponent

    return lookup_unit_table(_UNIT_TABLE, s)
1532
1533
def parse_count(s):
    """Parse a view/like count such as '1.2M' into an int, or None."""
    if s is None:
        return None

    s = s.strip()

    # Plain numbers (possibly with separators) need no unit table.
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {}
    for units, multiplier in ((('k', 'K'), 1000),
                              (('m', 'M', 'kk', 'KK'), 1000 ** 2)):
        for unit in units:
            _UNIT_TABLE[unit] = multiplier

    return lookup_unit_table(_UNIT_TABLE, s)
1553
1554
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    if name not in ENGLISH_MONTH_NAMES:
        return None
    return ENGLISH_MONTH_NAMES.index(name) + 1
1562
1563
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [name[:3] for name in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
1572
1573
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # A '&' that already begins a recognised entity reference is left alone.
    bare_amp = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_amp, '&amp;', xml_str)
1580
1581
def setproctitle(title):
    """Set the process name shown by tools like ps (glibc/Linux only)."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    encoded = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(encoded))
    buf.value = encoded
    try:
        # 15 is PR_SET_NAME.
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1601
1602
def remove_start(s, start):
    """Strip *start* from the beginning of s when present (None-safe)."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1605
1606
def remove_end(s, end):
    """Strip *end* from the end of s when present (None-safe)."""
    if s is None or not s.endswith(end):
        return s
    return s[:-len(end)]
1609
1610
def remove_quotes(s):
    """Strip one matching pair of surrounding quotes, if any."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1618
1619
def url_basename(url):
    """Return the last path component of a URL (query/fragment excluded)."""
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').rpartition('/')[2]
1623
1624
class HEADRequest(compat_urllib_request.Request):
    """A Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
1628
1629
class PUTRequest(compat_urllib_request.Request):
    """A Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
1633
1634
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert *v* to int, returning *default* when conversion fails.

    Optionally reads attribute *get_attr* from v first, and rescales the
    result by invscale/scale (integer division).
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    # BUG FIX: int() raises TypeError (not ValueError) for non-numeric
    # types such as lists/dicts; those must also yield the default.
    except (ValueError, TypeError):
        return default
1647
1648
def str_or_none(v, default=None):
    """Stringify *v*, returning *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
1651
1652
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators ('.' and ',') and a leading '+'.
    for ch in (',', '.', '+'):
        int_str = int_str.replace(ch, '')
    return int(int_str)
1659
1660
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float, returning *default* when conversion fails.

    The result is rescaled by invscale/scale (true division).
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    # BUG FIX: float() raises TypeError (not ValueError) for non-numeric
    # types such as lists/dicts; those must also yield the default.
    except (ValueError, TypeError):
        return default
1668
1669
def strip_or_none(v):
    """Strip surrounding whitespace from *v* (None passes through)."""
    if v is None:
        return None
    return v.strip()
1672
1673
def parse_duration(s):
    """Parse a duration expression into a number of seconds, or None."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days = hours = mins = secs = ms = None
    # 1) Clock style: [[[dd:]hh:]mm:]ss[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # 2) ISO-8601-like / verbose: 'PT1H2M', '1d 2hours 3min 4.5s', ...
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # 3) Phrases like '2.5 hours' or '90 min'.
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if not m:
                return None
            hours, mins = m.groups()

    duration = 0
    # Keep this accumulation order (secs, mins, hours, days, ms) so the
    # floating point result matches exactly.
    for value, scale in ((secs, 1), (mins, 60), (hours, 60 * 60),
                         (days, 24 * 60 * 60), (ms, 1)):
        if value:
            duration += float(value) * scale
    return duration
1720
1721
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the file's real extension.

    When expected_real_ext is given and the actual extension differs,
    ext is appended after the whole name instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1728
1729
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the file's extension with *ext*.

    When expected_real_ext is given and the actual extension differs,
    ext is appended to the full name instead of replacing.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = name
    return '{0}.{1}'.format(base, ext)
1735
1736
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: the default used to be the mutable literal [], a classic
    # shared-default pitfall; use None and substitute here instead.
    if args is None:
        args = []
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Covers both "not found" and "not executable".
        return False
    return exe
1745
1746
def get_exe_version(exe, args=None,
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # NOTE: the default used to be the mutable list ['--version'], a
    # shared-default pitfall; use None and substitute here instead.
    if args is None:
        args = ['--version']
    try:
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1760
1761
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from program output, or *unrecognized*."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1771
1772
class PagedList(object):
    """Base class for lazily paged result lists; subclasses implement
    getslice()."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1777
1778
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages on demand, optionally caching them."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def _fetch_page(self, pagenum):
        # Serve from the cache when enabled, fetching at most once per page.
        if self._use_cache:
            cached = self._cache.get(pagenum)
            if cached is not None:
                return cached
        results = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = results
        return results

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = self._fetch_page(pagenum)

            startv = start % self._pagesize if firstid <= start < nextfirstid else 0
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A page shorter than page_size must be the last one; no need
            # to query further pages.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not
            # interesting, break out early as well.
            if end == nextfirstid:
                break
        return res
1829
1830
class InAdvancePagedList(PagedList):
    """Paged list whose total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else end // self._pagesize + 1
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page needs its head trimmed.
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page satisfies the remaining demand.
                    res.extend(page[:only_more])
                    return res
            res.extend(page)
        return res
1858
1859
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
1866
1867
def lowercase_escape(s):
    """Decode literal \\uXXXX escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
1874
1875
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() needs a UTF-8 byte string.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1881
1882
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host goes through IDNA; every other component is percent-escaped.
    return parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    ).geturl()
1893
1894
def read_batch_urls(batch_fd):
    """Read URLs from a batch file, skipping comment and blank lines."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Strip a UTF-8 BOM that survived decoding as individual chars.
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # Lines starting with '#', ';' or ']' are comments.
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1909
1910
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1913
1914
def update_url_query(url, query):
    """Return *url* with the items of *query* merged into its query string."""
    if not query:
        return url
    parsed = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parsed.query)
    merged.update(query)
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parsed._replace(query=new_query))
1923
1924
def update_Request(req, url=None, data=None, headers=None, query=None):
    """Clone *req*, optionally overriding its URL, body, headers or query.

    The request class (plain/HEAD/PUT) and any timeout are preserved.
    NOTE: headers/query used to default to mutable {} literals; None is
    used instead to avoid the shared-mutable-default pitfall.
    """
    req_headers = req.headers.copy()
    req_headers.update(headers or {})
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query or {})
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # Not every Request object carries a timeout attribute.
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
1943
1944
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key from a list/tuple of keys.

    None values are always skipped; with skip_false_values (the default)
    any falsy value is skipped as well.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key in d and d[key] is not None and (d[key] or not skip_false_values):
            return d[key]
    return default
1953
1954
def try_get(src, getter, expected_type=None):
    """Apply getter(src); return None on common lookup errors or when the
    result is not an instance of expected_type."""
    try:
        v = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is None or isinstance(v, expected_type):
        return v
1963
1964
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Coerce *string* to compat_str, decoding byte strings with *encoding*."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1967
1968
# MPAA-style US content ratings mapped to the minimum viewer age used by
# parse_age_limit() below.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1976
1977
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+', or a US rating ('PG-13')."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s)
1983
1984
def strip_jsonp(code):
    """Remove a JSONP callback wrapper, leaving the bare JSON payload."""
    wrapper_re = r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(wrapper_re, r'\1', code)
1988
1989
def js_to_json(code):
    """Convert a JavaScript object/array literal into valid JSON text."""
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('/*') or v == ',':
            # Comments and trailing commas are simply dropped.
            return ''

        if v[0] in ("'", '"'):
            # Re-escape the string body so it is valid JSON.
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])

        # Hex/octal integers become decimal; as an object key they must
        # also be quoted.
        for regex, base in ((r'^0[xX][0-9a-fA-F]+', 16), (r'^0+[0-7]+', 8)):
            im = re.match(regex, v)
            if im:
                i = int(im.group(0), base)
                return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
2027
2028
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality rank; unknown ids rank -1.
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2037
2038
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2040
2041
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    # Truncate so that the ellipses fit within the length budget.
    return s[:length - len(ELLIPSES)] + ELLIPSES
2050
2051
def version_tuple(v):
    """Split a version string on '.'/'-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
2054
2055
def is_outdated_version(version, limit, assume_new=True):
    """Return True when *version* is older than *limit*."""
    # Missing or malformed versions fall back on the assume_new policy.
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
2063
2064
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Only zip-bundled or frozen (exe) builds support self-updating.
    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
2070
2071
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = (compat_shlex_quote(a) for a in args)
    return ' '.join(quoted)
2075
2076
def error_to_compat_str(err):
    """Return an exception's message as text, decoding on Python 2."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
2084
2085
def mimetype2ext(mt):
    """Guess a file extension from a MIME type, or return None.

    Falls back on the (lower-cased) subtype when no explicit mapping
    exists.
    """
    if mt is None:
        return None

    # Content-Type values routinely carry parameters (e.g.
    # "; charset=utf-8"); only the type/subtype part matters here.
    mt = mt.partition(';')[0].strip()

    ext = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')
    res = res.lower()

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
    }.get(res, res)
2119
2120
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a response, preferring the
    Content-Disposition filename over the Content-Type."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
2133
2134
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI from raw bytes."""
    b64 = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, b64)
2137
2138
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # Either no viewer limit was set, or the content is open to everyone
        return False
    return age_limit < content_limit
2147
2148
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # (BOM, encoding) pairs; the 4-byte UTF-32 BOMs are listed before the
    # 2-byte UTF-16 ones they share a prefix with
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    decoded = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
2167
2168
def determine_protocol(info_dict):
    # An explicitly declared protocol always wins
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    # Streaming schemes are recognized straight from the URL prefix
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    # Manifest formats are identified by their file extension
    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2189
2190
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Column width = widest cell in that column
    col_widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Left-align every column but the last, with one space of padding
    format_str = ' '.join('%-' + compat_str(width + 1) + 's' for width in col_widths[:-1]) + '%s'
    return '\n'.join(format_str % tuple(row) for row in rows)
2197
2198
def _match_one(filter_part, dct):
    # Evaluate a single filter expression against dct.  Two forms are
    # supported: a comparison ('key OP value', e.g. 'duration > 60') and a
    # unary existence test ('key' / '!key').  Raises ValueError for any
    # expression that matches neither form.
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # String operands only make sense for (in)equality
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain integer: try it as a file-size expression
                # ('500k'), first as written, then with an implied 'B' suffix
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # A trailing '?' after the operator makes the comparison pass
            # when the key is absent (returns the truthy matched text)
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2256
2257
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&' joins sub-expressions; every one of them must hold
    parts = filter_str.split('&')
    return all(_match_one(part, dct) for part in parts)
2263
2264
def match_filter_func(filter_str):
    # Build a match-filter callback: it returns None to accept the video,
    # or a human-readable message explaining why it is skipped.
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
2273
2274
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float), or None."""
    if not time_expr:
        return None

    # Plain offset in seconds, with an optional trailing 's' unit
    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    # Clock time HH:MM:SS with an optional fraction; a ':' before the
    # fraction is treated the same as a decimal point here
    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours = int(clock_match.group(1))
        minutes = int(clock_match.group(2))
        seconds = float(clock_match.group(3).replace(':', '.'))
        return 3600 * hours + 60 * minutes + seconds
    return None
2286
2287
def srt_subtitles_timecode(seconds):
    """Format a position in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d truncates the float components, so no explicit int() is needed
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2290
2291
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup (a text string) to SRT.

    Raises ValueError when the document contains no <p> paragraphs.
    """
    # Qualify a tag name with each of the known TTML namespace URIs
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Accumulates the plain text of one <p> element; <br> (in any of
        # the namespaces) becomes a newline
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Serialize the node and re-feed it through the text-extracting parser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    # Paragraphs may live in any of the namespaces above, or in none
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        # Paragraphs without a start time are dropped; a missing end time
        # is derived from 'dur' when available, otherwise the paragraph is
        # dropped as well
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2345
2346
def cli_option(params, command_option, param):
    # Emit [option, value] when the setting is present, nothing otherwise
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2350
2351
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    # Render a boolean setting either as two argv items, or as a single
    # 'option<separator>value' item when a separator is given
    flag = params.get(param)
    assert isinstance(flag, bool)
    value = true_value if flag else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2358
2359
def cli_valueless_option(params, command_option, param, expected_value=True):
    # Emit the bare option only when the setting equals the expected value
    return [command_option] if params.get(param) == expected_value else []
2363
2364
def cli_configuration_args(params, param, default=[]):
    # NOTE(review): the mutable default is only ever returned, never
    # mutated here; callers should not modify the returned list in place
    extra_args = params.get(param)
    if extra_args is None:
        return default
    assert isinstance(extra_args, list)
    return extra_args
2371
2372
class ISO639Utils(object):
    """Static conversions between ISO 639-1 (two-letter) and ISO 639-2/T
    (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are significant (so e.g. 'en-US'
        # still resolves); returns None for unknown codes
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Linear scan over the table; implicitly returns None when no
        # entry maps to the given three-letter code
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
2573
2574
class ISO3166Utils(object):
    """Lookup of full country names from ISO 3166 two-letter codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive on input; returns None for unknown codes
        return cls._country_map.get(code.upper())
2833
2834
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler variant that lets an individual request override the
    proxy via an internal 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    # Default arguments bind the loop's current 'type' and
                    # the bound method at definition time, avoiding the
                    # late-binding closure pitfall
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # Consume the per-request override header so it is never sent out
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            # Tag the request instead of proxying here: youtube-dl's
            # http/https handlers take care of wrapping the socket with SOCKS
            req.add_header('Ytdl-socks-proxy', proxy)
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2858
2859
def ohdave_rsa_encrypt(data, exponent, modulus):
    """Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # The bytes are reversed first, i.e. the payload integer is read
    # little-endian
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
2875
2876
2877 def encode_base_n(num, n, table=None):
2878 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
2879 if not table:
2880 table = FULL_TABLE[:n]
2881
2882 if n > len(table):
2883 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
2884
2885 if num == 0:
2886 return table[0]
2887
2888 ret = ''
2889 while num:
2890 ret = table[num % n] + ret
2891 num = num // n
2892 return ret
2893
2894
def decode_packed_codes(code):
    # Reverse a 'p,a,c,k,e,d'-style JS packer wrapper (presumably Dean
    # Edwards' packer — the wrapper shape matches): extract the packed
    # source, base, symbol count and symbol list, then substitute every
    # identifier from the rebuilt symbol table.
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    packed_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for idx in range(count - 1, -1, -1):
        key = encode_base_n(idx, base)
        # An empty symbol means the identifier stands for itself
        symbol_table[key] = symbols[idx] or key

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        packed_code)
2913
2914
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list into a dict, stripping double quotes
    from quoted values."""
    info = {}
    for match in re.finditer(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        value = match.group('val')
        if value.startswith('"'):
            value = value[1:-1]
        info[match.group('key')] = value
    return info
2922
2923
def urshift(val, n):
    # Emulate an unsigned 32-bit right shift (JavaScript's '>>>'):
    # negative inputs are first wrapped to their unsigned representation
    if val < 0:
        val += 0x100000000
    return val >> n