]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[utils] Add decode_png for openload (#9706)
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import errno
15 import functools
16 import gzip
17 import io
18 import itertools
19 import json
20 import locale
21 import math
22 import operator
23 import os
24 import pipes
25 import platform
26 import re
27 import socket
28 import ssl
29 import subprocess
30 import sys
31 import tempfile
32 import traceback
33 import xml.etree.ElementTree
34 import zlib
35
36 from .compat import (
37 compat_HTMLParser,
38 compat_basestring,
39 compat_chr,
40 compat_etree_fromstring,
41 compat_html_entities,
42 compat_html_entities_html5,
43 compat_http_client,
44 compat_kwargs,
45 compat_parse_qs,
46 compat_shlex_quote,
47 compat_socket_create_connection,
48 compat_str,
49 compat_struct_pack,
50 compat_struct_unpack,
51 compat_urllib_error,
52 compat_urllib_parse,
53 compat_urllib_parse_urlencode,
54 compat_urllib_parse_urlparse,
55 compat_urllib_parse_unquote_plus,
56 compat_urllib_request,
57 compat_urlparse,
58 compat_xpath,
59 )
60
61 from .socks import (
62 ProxyType,
63 sockssocket,
64 )
65
66
def register_socks_protocols():
    """Teach urlparse that SOCKS schemes carry a netloc component.

    In Python < 2.6.5, urlsplit() suffers from https://bugs.python.org/issue7904:
    URLs whose scheme is not listed in urlparse.uses_netloc are parsed
    incorrectly, so each SOCKS variant is registered exactly once.
    """
    known_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known_schemes:
            known_schemes.append(scheme)
74
75
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers added to every request (see YoutubeDLHandler.http_request).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel distinguishing "no default supplied" from an explicit None default.
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# File extensions recognized as media containers/streams.
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
# Maps each accented character to an ASCII transliteration; multi-letter
# replacements ('AE', 'ss', ...) are wrapped in one-element lists so that
# zip() pairs each with a single source character.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
113
# strptime() patterns tried, in order, when parsing free-form dates.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%b %d %Y',
    # English ordinal-day timestamps ("Jan 1st 2016 10:30"); the suffix is
    # matched literally after %d.  Bug fix: '%drd' (3rd, 23rd) was missing,
    # so such dates silently failed to parse.
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
)

# Extra patterns for ambiguous numeric dates: day-first (most locales) ...
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# ... and month-first (US style).
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
156
157
def preferredencoding():
    """Return the best guess at the system's preferred text encoding.

    Based on locale.getpreferredencoding(), falling back to UTF-8 whenever
    the reported codec is unusable or the lookup itself fails.
    """
    try:
        candidate = locale.getpreferredencoding()
        # Probe that the reported codec actually exists and can encode.
        'TEST'.encode(candidate)
    except Exception:
        candidate = 'UTF-8'
    return candidate
171
172
def write_json_file(obj, fn):
    """Encode obj as JSON and write it to fn, atomically if possible.

    The data is first serialized into a temporary file created next to fn
    (same directory, so os.rename stays on one filesystem) and then renamed
    over the target, so readers never observe a half-written file.  On
    failure the temporary file is removed best-effort and the error
    re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object.
        # Bug fix: operate on the lambda's own argument instead of always
        # closing over fn.
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        # Keep the target name visible in the temp file for debuggability.
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the orphaned temp file before re-raising.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
225
226
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # The key is spliced into the path expression, so restrict it to
        # characters that cannot change the query's meaning.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        # ElementTree supports [@attr] / [@attr='value'] predicates from 2.7 on.
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        # Python 2.6 fallback: no predicate support in ElementTree, so scan
        # every match and filter on the attribute by hand.
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
241
242 # On python2.6 the xml.etree.ElementTree.Element methods don't support
243 # the namespace parameter
244
245
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into ElementTree '{uri}tag' form.

    ns_map maps each prefix to its namespace URI; steps without a prefix are
    passed through untouched.
    """
    expanded = []
    for step in path.split('/'):
        pieces = step.split(':')
        if len(pieces) == 1:
            expanded.append(pieces[0])
        else:
            prefix, tag = pieces
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
256
257
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath (a single path, or an iterable
    of candidate paths tried in order).

    On a miss: return `default` if one was supplied, raise ExtractorError
    if `fatal` is set, otherwise return None.
    """
    def _search(xp):
        return node.find(compat_xpath(xp))

    if isinstance(xpath, (str, compat_str)):
        n = _search(xpath)
    else:
        for xp in xpath:
            n = _search(xp)
            if n is not None:
                break

    if n is not None:
        return n
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element %s' % (xpath if name is None else name))
    return None
279
280
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text content."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    # A miss already resolved to None or the caller's default.
    if n is None or n == default:
        return n
    if n.text is not None:
        return n.text
    # Element exists but carries no text.
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element\'s text %s' % (xpath if name is None else name))
    return None
294
295
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return the value of attribute `key` on the element matched by xpath."""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML attribute %s' % (
            '%s[@%s]' % (xpath, key) if name is None else name))
    return None
307
308
def get_element_by_id(id, html):
    """Return the inner content of the first tag in `html` whose id attribute equals `id`."""
    return get_element_by_attribute('id', id, html)
312
313
def get_element_by_class(class_name, html):
    """Return the inner content of the first tag carrying the given CSS class."""
    # The class attribute is a whitespace-separated list, so match the name
    # on word boundaries anywhere inside the quoted value; the pattern is
    # pre-built, hence escape_value=False.
    class_pattern = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_element_by_attribute(
        'class', class_pattern, html, escape_value=False)
318
319
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    if escape_value:
        value = re.escape(value)

    match = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s+%s=['"]?%s['"]?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html)

    if match is None:
        return None

    content = match.group('content')
    # Strip one level of surrounding quotes if the capture begins with one.
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
343
344
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        # Mapping of attribute name -> value (None for valueless attributes);
        # overwritten by every start tag seen, so feed() should only ever be
        # given a single element.
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs arrives as a list of (name, value) pairs, already
        # entity-decoded and lowercased by HTMLParser.
        self.attrs = dict(attrs)
353
354
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    attr_parser.feed(html_element)
    attr_parser.close()
    return attr_parser.attrs
375
376
def clean_html(html):
    """Clean an HTML snippet into a readable plain-text string."""

    # Convenience for sanitizing descriptions etc.
    if html is None:
        return html

    # Literal newlines carry no meaning in HTML; real line breaks come from
    # <br> tags and paragraph boundaries, which are turned back into '\n'.
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Strip the remaining markup, then decode entities.
    text = re.sub('<.*?>', '', text)
    return unescapeHTML(text).strip()
392
393
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                # Switch stdout to binary mode so media bytes are not mangled
                # by CRLF translation.
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            # '-' means stdout; prefer the raw byte buffer on Python 3.
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors cannot be fixed by renaming; propagate directly.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
424
425
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    # parsedate_tz yields None for unparsable input; mirror that as None.
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
433
434
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Keep timestamps readable: "12:34:56" becomes "12_34_56" rather than
    # going through the generic ':' replacement below.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    cleaned = ''.join(replace_insane(c) for c in s)
    if not is_id:
        while '__' in cleaned:
            cleaned = cleaned.replace('__', '_')
        cleaned = cleaned.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and cleaned.startswith('-_'):
            cleaned = cleaned[2:]
        if cleaned.startswith('-'):
            cleaned = '_' + cleaned[len('-'):]
        cleaned = cleaned.lstrip('.')
        if not cleaned:
            cleaned = '_'
    return cleaned
473
474
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # Other platforms allow these characters; leave the path untouched.
    if sys.platform != 'win32':
        return s
    drive, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive:
        drive, _ = os.path.splitunc(s)
    parts = os.path.normpath(remove_start(s, drive)).split(os.path.sep)
    if drive:
        # Drop the leading root component; the drive/UNC prefix is
        # re-attached below.
        parts.pop(0)
    cleaned = [
        part if part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', part)
        for part in parts]
    if drive:
        cleaned.insert(0, drive + os.path.sep)
    return os.path.join(*cleaned)
491
492
493 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
494 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Prepend the http: scheme to protocol-relative (//host/...) URLs.

    Mitigates failures on URLs extracted without an explicit protocol.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
497
498
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request with the URL passed through sanitize_url() first."""
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
501
502
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first-seen order """
    # Membership is tested against the result list (not a set) on purpose:
    # this keeps unhashable elements working, at O(n^2) cost.
    deduped = []
    for item in iterable:
        if item not in deduped:
            deduped.append(item)
    return deduped
510
511
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    # Strip the trailing ';' for the legacy (HTML 4) name table lookup below.
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character reference: decimal ('#38') or hexadecimal ('#x26').
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            # Prefix with '0' so int() accepts the '0x...' form.
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
541
542
def unescapeHTML(s):
    """Replace HTML entities (&amp;, &#38;, ...) in s with their characters.

    Unknown entities are left as-is; s must already be a text (unicode) string.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    # The handler receives the entity body including its terminating ';'.
    return re.sub(
        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
550
551
def get_subprocess_encoding():
    """Return the character encoding used for subprocess arguments/output."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    # Elsewhere the filesystem encoding is the best proxy; it can be None
    # in exotic locales, in which case fall back to UTF-8.
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
562
563
def encodeFilename(s, for_subprocess=False):
    """Encode a text filename to bytes where the platform needs it.

    @param s The name of the file (must be a unicode/str object)
    @param for_subprocess True when the name is destined for a subprocess
           command line rather than for filesystem APIs
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
586
587
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename(): turn a byte filename back into text.

    On Python 3, and for values that are not bytes at all, the input is
    returned untouched.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
597
598
def encodeArgument(s):
    """Encode a command-line argument for subprocess use (see encodeFilename)."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
606
607
def decodeArgument(b):
    """Decode a subprocess command-line argument; see decodeFilename()."""
    return decodeFilename(b, True)
610
611
def decodeOption(optval):
    """Decode a command-line option value to text, if it arrived as bytes."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
620
621
def formatSeconds(secs):
    """Format a duration in seconds as [H:]M:SS-style text.

    Examples: 3661 -> '1:01:01', 61 -> '1:01', 59 -> '59'.
    """
    # Use >= so exact boundaries render in the larger unit: previously
    # 3600 was shown as '60:00' and 60 as '60'.
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
629
630
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honouring the 'nocheckcertificate'
    option across the SSL APIs of all supported Python versions."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname must be disabled before verify_mode can be
            # set to CERT_NONE.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
654
655
def bug_reports_message():
    """Return the boilerplate appended to unexpected-error messages,
    pointing users at the bug tracker and update instructions."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return (
        '; please report this issue on https://yt-dl.org/bug .'
        + ' Make sure you are using the latest version; %s.' % update_cmd
        + ' Be sure to call youtube-dl with the --verbose flag and include its complete output.')
665
666
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network failures and timeouts are never a youtube-dl bug.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get the "please report this issue" boilerplate.
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        # Render the stored traceback as text, or None if none was supplied.
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
694
695
class UnsupportedError(ExtractorError):
    """Raised when no extractor matches the given URL."""
    def __init__(self, url):
        # Keep the offending URL available to callers.
        self.url = url
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
701
702
class RegexNotFoundError(ExtractorError):
    """Raised when a mandatory regular-expression search did not match."""
    pass
706
707
class DownloadError(Exception):
    """Raised by FileDownloader objects when downloading fails and the
    downloader is not configured to continue on errors; carries the
    user-facing error message.
    """

    def __init__(self, msg, exc_info=None):
        """exc_info, if given, is the original sys.exc_info() triple that caused the trouble."""
        super(DownloadError, self).__init__(msg)
        # Preserved so callers can inspect or re-raise the original error.
        self.exc_info = exc_info
720
721
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
729
730
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Forward the message to Exception so str(e) and generic logging
        # show it (previously only the .msg attribute carried it and
        # str(e) was empty).
        super(PostProcessingError, self).__init__(msg)
        # Kept for existing callers that read .msg directly.
        self.msg = msg
740
741
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
745
746
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
754
755
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Give the base class a readable message (previously str(e) was empty).
        super(ContentTooShortError, self).__init__(
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
768
769
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate an HTTP(S) connection, honouring the 'source_address'
    option from the handler's params."""
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        # Port 0 lets the OS pick an ephemeral local port.
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address support: monkey-patch connect() to create and
            # bind the socket ourselves.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
795
796
def handle_youtubedl_headers(headers):
    """Strip internal youtube-dl pseudo-headers before the real request.

    'Youtubedl-no-compression' removes any Accept-Encoding header and is
    itself dropped; all other headers pass through unchanged (the original
    mapping is returned as-is when no pseudo-header is present).
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict(
        (name, value) for name, value in headers.items()
        if name.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
805
806
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # YoutubeDL params; read by _create_http_connection (source_address).
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        # Internal pseudo-header selecting a SOCKS proxy for this request.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        # Handle both raw deflate streams and zlib-wrapped ones.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Newer addinfourl accepts the status code directly; older versions
        # need it assigned after construction.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        # Fill in the default headers without clobbering caller-set ones.
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with progressively more trailing bytes stripped off.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
936
937
def make_socks_conn_class(base_class, socks_proxy):
    """Derive an HTTP(S) connection class that tunnels through socks_proxy.

    socks_proxy is a URL such as socks5://user:pass@host:port.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    # NOTE(review): an unrecognized scheme leaves socks_type unbound and
    # surfaces later as a NameError; callers appear to pass pre-validated
    # schemes only — confirm before tightening this into an explicit error.

    def unquote_if_non_empty(s):
        # Credentials may be percent-encoded in the proxy URL.
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, wrap the tunneled socket in TLS as the base class
            # would have done.
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
979
980
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS counterpart of YoutubeDLHandler: opens connections through an
    optional custom connection class, SSL context and SOCKS proxy."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        # YoutubeDL params; read by _create_http_connection (source_address).
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        # Forward the SSL context / hostname checking set up by the base
        # handler, where the running Python version provides them.
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # Internal pseudo-header selecting a SOCKS proxy for this request.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)
1004
1005
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that routes https traffic through the same
    request/response hooks as plain http."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # (Workaround currently disabled — kept for reference.)
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1028
1029
def extract_timezone(date_str):
    """Split a trailing UTC offset (or 'Z') off *date_str*.

    Returns a (timedelta, remaining_string) pair; the delta is zero when no
    explicit signed offset is present.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if m is None:
        return datetime.timedelta(), date_str
    # Drop the timezone part from the string in all matched cases.
    date_str = date_str[:-len(m.group('tz'))]
    sign = m.group('sign')
    if not sign:
        # Plain 'Z' suffix: UTC, zero offset.
        return datetime.timedelta(), date_str
    direction = 1 if sign == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, date_str
1046
1047
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # strptime cannot digest fractional seconds here; drop them.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        dt = datetime.datetime.strptime(
            date_str, '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)) - timezone
    except ValueError:
        return None
    return calendar.timegm(dt.timetuple())
1065
1066
def date_formats(day_first=True):
    """Pick the strptime format list for day-first vs month-first ordering."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1069
1070
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None

    # Commas, AM/PM markers and timezones only confuse the parsers below.
    date_str = date_str.replace(',', ' ')
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    result = None
    # Deliberately no early exit: a later pattern overriding an earlier hit
    # mirrors the established format precedence.
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            pass

    if result is None:
        # Fall back to the RFC 2822 parser from the email package.
        parsed = email.utils.parsedate_tz(date_str)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is not None:
        return compat_str(result)
1097
1098
def unified_timestamp(date_str, day_first=True):
    """Convert a free-form date string to a UNIX timestamp (or None)."""
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    # A 'PM' marker means the parsed hour field is 12 hours behind.
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for fmt in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, fmt) - timezone + datetime.timedelta(hours=pm_delta)
        except ValueError:
            continue
        return calendar.timegm(dt.timetuple())

    # RFC 2822 fallback via the email package.
    parsed = email.utils.parsedate_tz(date_str)
    if parsed:
        return calendar.timegm(parsed) + pm_delta * 3600
1120
1121
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from the last path component of *url*."""
    if url is None:
        return default_ext
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
1133
1134
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
1137
1138
def date_from_str(date_str):
    """
    Return a datetime.date from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Raises ValueError when the string matches none of the accepted forms.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string literal: '\d' inside a plain string is an invalid escape
    # sequence and triggers warnings on modern Python versions.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # Approximate months/years as 30/365 days, since timedelta has no
        # calendar awareness.
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        return today + datetime.timedelta(**{unit: time})
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1166
1167
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    m = re.match(r'^(\d{4})(\d{2})(\d{2})$', date_str)
    # Anything that is not exactly eight digits passes through untouched.
    return date_str if m is None else '-'.join(m.groups())
1176
1177
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Unbounded ends default to the extreme representable dates.
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1207
1208
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Some Python 2 builds return a byte string here; normalize to text.
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
1217
1218
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map C file descriptors to GetStdHandle ids
    # (-11 = STD_OUTPUT_HANDLE, -12 = STD_ERROR_HANDLE).
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # WriteConsoleW only works on a real local console; for files and
        # pipes the caller must fall back to regular writes.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        # (len(s) when there is none).
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of up to 1024 BMP characters; a non-BMP character is
    # written on its own as a UTF-16 surrogate pair (2 code units).
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1292
1293
def write_string(s, out=None, encoding=None):
    """Write text *s* to *out* (default: stderr), coping with byte streams
    and Windows consoles."""
    out = sys.stderr if out is None else out
    assert type(s) == compat_str

    # On Windows, try the native wide-character console API first.
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream wrapping a binary buffer: encode ourselves so that
        # unencodable characters are dropped instead of raising.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1314
1315
def bytes_to_intlist(bs):
    """Turn a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Python 3 indexing of bytes yields ints already; Python 2 yields str.
    return list(bs) if isinstance(bs[0], int) else [ord(c) for c in bs]
1323
1324
def intlist_to_bytes(xs):
    """Inverse of bytes_to_intlist: pack integer byte values into bytes."""
    if not xs:
        return b''
    # One unsigned byte ('B') per list element.
    return compat_struct_pack('%dB' % len(xs), *xs)
1329
1330
# Cross-platform file locking: defines _lock_file(f, exclusive) and
# _unlock_file(f) using Win32 LockFileEx on Windows, fcntl.flock elsewhere,
# and raising stubs where neither is available (e.g. Jython).
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure passed to
        # LockFileEx/UnlockFileEx.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Maximum lockable byte range (low/high halves of the length).
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # Lock the whole file; flag 0x2 requests an exclusive lock,
        # 0x0 a shared one.
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Requires a preceding _lock_file on the same file object.
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1404
1405
class locked_file(object):
    """Context manager around io.open that holds an OS-level file lock
    (exclusive for writing/appending, shared for reading)."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            # Readers share the lock; writers need exclusivity.
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1435
1436
def get_filesystem_encoding():
    """sys.getfilesystemencoding() with a utf-8 fallback for odd platforms."""
    enc = sys.getfilesystemencoding()
    return 'utf-8' if enc is None else enc
1440
1441
def shell_quote(args):
    """Quote a list of arguments for display as a shell command line."""
    encoding = get_filesystem_encoding()

    def _to_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(encoding) if isinstance(a, bytes) else a

    return ' '.join(pipes.quote(_to_text(a)) for a in args)
1451
1452
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Merge with any data already smuggled into the URL fragment.
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    payload = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + payload
1461
1462
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url: returns (bare_url, data_or_default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    payload = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1470
1471
def format_bytes(bytes):
    """Human-readable size string for a byte count ('N/A' for None).

    Accepts ints, floats and numeric strings; e.g. 1024 -> '1.00KiB'.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    SUFFIXES = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # Clamp so absurdly large values do not index past the suffix
        # table (previously an IndexError for values >= 1024**9).
        exponent = min(int(math.log(bytes, 1024.0)), len(SUFFIXES) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, SUFFIXES[exponent])
1484
1485
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' with *unit_table* as multipliers; None if no match."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    # A comma is accepted as decimal separator.
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
1495
1496
def parse_filesize(s):
    """Parse a human-readable file size ('5.5MiB', '100 KB', ...) into bytes."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
    }
    return lookup_unit_table(_UNIT_TABLE, s)
1541
1542
def parse_count(s):
    """Parse a view/like count such as '1.2K' or '3,456' into an int."""
    if s is None:
        return None

    s = s.strip()

    # Plain digit groups (possibly with separators) go straight to str_to_int.
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }
    return lookup_unit_table(_UNIT_TABLE, s)
1562
1563
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1571
1572
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbrevs = [s[:3] for s in ENGLISH_MONTH_NAMES]
    if abbrev in abbrevs:
        return abbrevs.index(abbrev) + 1
    return None
1581
1582
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Existing entities and character references are left untouched.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;', xml_str)
1589
1590
def setproctitle(title):
    """Best-effort rename of the current process (Linux, via libc prctl);
    silently does nothing on other platforms."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 = PR_SET_NAME (see prctl(2)).
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1610
1611
def remove_start(s, start):
    """Strip *start* from the beginning of *s* if present (None passes through)."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
1614
1615
def remove_end(s, end):
    """Strip *end* from the end of *s* if present (None passes through)."""
    if s is None or not s.endswith(end):
        return s
    return s[:-len(end)]
1618
1619
def remove_quotes(s):
    """Strip one matching pair of single or double quotes around *s*."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1627
1628
def url_basename(url):
    """Last path component of *url* (query string and fragment excluded)."""
    path = compat_urlparse.urlparse(url).path
    return path.rstrip('/').rpartition('/')[2]
1632
1633
class HEADRequest(compat_urllib_request.Request):
    """Request subclass that issues HTTP HEAD instead of GET."""
    def get_method(self):
        return 'HEAD'
1637
1638
class PUTRequest(compat_urllib_request.Request):
    """Request subclass that issues HTTP PUT instead of GET."""
    def get_method(self):
        return 'PUT'
1642
1643
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* (optionally an attribute of it) to a scaled int;
    returns *default* for None, '' or unparsable values."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '' or v is None:
        return default
    try:
        return int(v) * invscale // scale
    except ValueError:
        return default
1656
1657
def str_or_none(v, default=None):
    """compat_str(v), or *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
1660
1661
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and stray plus signs before converting.
    return int(re.sub(r'[,\.\+]', '', int_str))
1668
1669
def float_or_none(v, scale=1, invscale=1, default=None):
    """float(v) scaled by invscale/scale; *default* when None or unparsable."""
    if v is None:
        return default
    try:
        result = float(v) * invscale / scale
    except ValueError:
        return default
    return result
1677
1678
def strip_or_none(v):
    """v.strip(), passing None through unchanged."""
    if v is None:
        return None
    return v.strip()
1681
1682
def parse_duration(s):
    """Parse a duration expression ('1:02:03.5', '2h 3min', 'PT1M30S', ...)
    into seconds as a float, or None when unrecognized."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days = hours = mins = secs = ms = None
    # 1) Colon-separated clock notation, optionally with fractional seconds.
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # 2) Unit-suffixed notation, also covering ISO-8601-ish 'PT...' forms.
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # 3) Fractional hours/minutes with spelled-out units.
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if not m:
                return None
            hours, mins = m.groups()

    # Sum whichever components matched (same accumulation order as before).
    total = 0
    for value, factor in ((secs, 1), (mins, 60), (hours, 3600), (days, 86400), (ms, 1)):
        if value:
            total += float(value) * factor
    return total
1729
1730
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension; when the actual extension
    differs from *expected_real_ext*, append it instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1737
1738
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the file extension for *ext*; when the current extension does not
    match *expected_real_ext*, append instead of replacing."""
    name, real_ext = os.path.splitext(filename)
    base = name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename
    return '{0}.{1}'.format(base, ext)
1744
1745
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        proc = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1754
1755
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        proc = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = proc.communicate()
    except OSError:
        return False
    # Python 2 returns bytes from communicate().
    if isinstance(out, bytes):
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1769
1770
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from program *output* via *version_re*;
    returns *unrecognized* when nothing matches."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1780
1781
class PagedList(object):
    """Base class for lazily paged sequences; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1786
1787
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand via pagefunc(pagenum),
    optionally caching fetched pages."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the elements [start:end) across page boundaries."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # First and one-past-last element ids covered by this page.
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of the first wanted element within this page.
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # One past the last wanted element within this page (or None).
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1838
1839
class InAdvancePagedList(PagedList):
    """PagedList where the total number of pages is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the elements [start:end), fetching only the needed pages."""
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Elements to drop from the head of the first fetched page.
        skip_elems = start - start_page * self._pagesize
        # Remaining number of wanted elements (None = take everything).
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page contains the last wanted element.
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1867
1868
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
1875
1876
def lowercase_escape(s):
    """Decode literal '\\uXXXX' escape sequences embedded in *s*."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
1883
1884
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() wants a byte string.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1890
1891
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    return parts._replace(
        # The host is IDNA-encoded rather than percent-escaped.
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    ).geturl()
1902
1903
def read_batch_urls(batch_fd):
    """Read one URL per line from *batch_fd*, dropping a UTF-8 BOM, blank
    lines and comment lines (starting with '#', ';' or ']')."""
    def _clean(line):
        if not isinstance(line, compat_str):
            line = line.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if line.startswith(BOM_UTF8):
            line = line[len(BOM_UTF8):]
        line = line.strip()
        return False if line.startswith(('#', ';', ']')) else line

    with contextlib.closing(batch_fd) as fd:
        return [u for u in map(_clean, fd) if u]
1918
1919
def urlencode_postdata(*args, **kargs):
    """urlencode() the arguments and return ASCII bytes ready for a POST body."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1922
1923
def update_url_query(url, query):
    """Merge the *query* mapping into the URL's existing query string."""
    if not query:
        return url
    parsed = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parsed.query)
    params.update(query)
    return compat_urlparse.urlunparse(parsed._replace(
        query=compat_urllib_parse_urlencode(params, True)))
1932
1933
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Clone *req*, optionally overriding URL, body, headers and query,
    preserving the HTTP method (HEAD/PUT/GET...)."""
    new_headers = req.headers.copy()
    new_headers.update(headers)
    new_data = data or req.data
    new_url = update_url_query(url or req.get_full_url(), query)
    method = req.get_method()
    if method == 'HEAD':
        req_type = HEADRequest
    elif method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        new_url, data=new_data, headers=new_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # urllib Requests only carry a timeout once one has been set explicitly.
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
1952
1953
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """First usable value from *d* for the given key(s); None values are
    always skipped, falsy ones only when skip_false_values is set."""
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        value = d.get(key)
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1962
1963
def try_get(src, getter, expected_type=None):
    """Apply *getter* to *src*, swallowing common lookup errors; optionally
    require the result to be an instance of *expected_type*."""
    try:
        v = getter(src)
    except (AttributeError, KeyError, TypeError, IndexError):
        return None
    if expected_type is None or isinstance(v, expected_type):
        return v
1972
1973
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Ensure *string* is a compat_str, decoding byte input with *encoding*."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1976
1977
# MPAA rating -> minimum viewer age, used by parse_age_limit().
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1985
1986
def parse_age_limit(s):
    """Parse an age limit ('18', '18+' or an MPAA rating) into an int.

    Integer input is passed through unchanged (previously a TypeError);
    returns None for None or unrecognized strings.
    """
    if s is None:
        return None
    if isinstance(s, int):
        # Callers sometimes already hold a numeric age limit.
        return s
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
1992
1993
def strip_jsonp(code):
    """Remove a JSONP callback wrapper, leaving the raw JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$',
        r'\1', code)
1997
1998
def js_to_json(code):
    """Convert a JavaScript object/value literal into valid JSON text."""
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v == ',':
            # Block comments and trailing commas are dropped entirely.
            return ""

        if v[0] in ("'", '"'):
            # Normalize string escapes to their JSON equivalents.
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])

        INTEGER_TABLE = (
            (r'^0[xX][0-9a-fA-F]+', 16),
            (r'^0+[0-7]+', 8),
        )

        # Hex/octal integers become decimal; as object keys they get quoted.
        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(0), base)
                return '"%d":' % i if v.endswith(':') else '%d' % i

        # Bare identifiers (and anything left) are emitted as quoted strings.
        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
        [0-9]+(?=\s*:)
        ''', fix_kv, code)
2036
2037
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality score; unknown ids rank lowest.
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2046
2047
2048 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
2049
2050
2051 def limit_length(s, length):
2052 """ Add ellipses to overly long strings """
2053 if s is None:
2054 return None
2055 ELLIPSES = '...'
2056 if len(s) > length:
2057 return s[:length - len(ELLIPSES)] + ELLIPSES
2058 return s
2059
2060
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(int(part) for part in re.split(r'[-.]', v))
2063
2064
def is_outdated_version(version, limit, assume_new=True):
    """True when *version* < *limit*; a missing/unparsable version yields
    (not assume_new)."""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
2072
2073
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return running_from_zip or hasattr(sys, 'frozen')
2079
2080
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2084
2085
def error_to_compat_str(err):
    """str(err), decoded with the locale encoding on Python 2."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
2093
2094
def mimetype2ext(mt):
    """Map a MIME type to a file extension; None input passes through."""
    if mt is None:
        return None

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    # Otherwise map on the subtype alone (the part after '/').
    subtype = mt.rpartition('/')[2].lower()
    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m': 'f4m',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
    }
    return SUBTYPE_MAP.get(subtype, subtype)
2130
2131
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs= string into {'vcodec': ..., 'acodec': ...}."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    splited_codecs = list(filter(None, map(
        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        # The leading fourcc identifies the codec family.
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        if len(splited_codecs) == 2:
            # NOTE(review): both values are None at this point, so this
            # returns {'vcodec': None, 'acodec': None} - presumably the raw
            # codec strings were intended here; confirm upstream.
            return {
                'vcodec': vcodec,
                'acodec': acodec,
            }
        elif len(splited_codecs) == 1:
            # NOTE(review): vcodec is always None here, so acodec ends up
            # None as well - looks suspicious, verify intent.
            return {
                'vcodec': 'none',
                'acodec': vcodec,
            }
    else:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    return {}
2166
2167
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a completed request, preferring the
    Content-Disposition filename over the Content-Type."""
    headers = url_handle.headers

    cd = headers.get('Content-Disposition')
    if cd:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
2180
2181
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI embedding *data* as base64."""
    encoded = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, encoded)
2184
2185
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # Without both a viewer age limit and a content rating there is
    # nothing to compare against, so the content stays available.
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
2194
2195
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Decode according to a recognized byte-order mark, defaulting to UTF-8.
    BOMS = (
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    )
    decoded = None
    for bom, encoding in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(encoding, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    # HTML-ish content starts with '<' after optional whitespace.
    return re.match(r'^\s*<', decoded)
2214
2215
def determine_protocol(info_dict):
    """Infer the download protocol for a format dict, preferring an explicit
    'protocol' entry over URL-based heuristics."""
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    # Streaming schemes are recognizable from the URL prefix alone.
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2236
2237
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Pad every column (except the last) to its widest cell plus one space.
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    fmt = ' '.join('%-' + compat_str(width + 1) + 's' for width in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
2244
2245
2246 def _match_one(filter_part, dct):
2247 COMPARISON_OPERATORS = {
2248 '<': operator.lt,
2249 '<=': operator.le,
2250 '>': operator.gt,
2251 '>=': operator.ge,
2252 '=': operator.eq,
2253 '!=': operator.ne,
2254 }
2255 operator_rex = re.compile(r'''(?x)\s*
2256 (?P<key>[a-z_]+)
2257 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2258 (?:
2259 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2260 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2261 )
2262 \s*$
2263 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2264 m = operator_rex.search(filter_part)
2265 if m:
2266 op = COMPARISON_OPERATORS[m.group('op')]
2267 if m.group('strval') is not None:
2268 if m.group('op') not in ('=', '!='):
2269 raise ValueError(
2270 'Operator %s does not support string values!' % m.group('op'))
2271 comparison_value = m.group('strval')
2272 else:
2273 try:
2274 comparison_value = int(m.group('intval'))
2275 except ValueError:
2276 comparison_value = parse_filesize(m.group('intval'))
2277 if comparison_value is None:
2278 comparison_value = parse_filesize(m.group('intval') + 'B')
2279 if comparison_value is None:
2280 raise ValueError(
2281 'Invalid integer value %r in filter part %r' % (
2282 m.group('intval'), filter_part))
2283 actual_value = dct.get(m.group('key'))
2284 if actual_value is None:
2285 return m.group('none_inclusive')
2286 return op(actual_value, comparison_value)
2287
2288 UNARY_OPERATORS = {
2289 '': lambda v: v is not None,
2290 '!': lambda v: v is None,
2291 }
2292 operator_rex = re.compile(r'''(?x)\s*
2293 (?P<op>%s)\s*(?P<key>[a-z_]+)
2294 \s*$
2295 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2296 m = operator_rex.search(filter_part)
2297 if m:
2298 op = UNARY_OPERATORS[m.group('op')]
2299 actual_value = dct.get(m.group('key'))
2300 return op(actual_value)
2301
2302 raise ValueError('Invalid filter part %r' % filter_part)
2303
2304
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # Every '&'-separated clause must hold.
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2310
2311
def match_filter_func(filter_str):
    """Build a match-filter callback: it returns None to accept a video, or a
    human-readable skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2320
2321
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression into seconds (None if unparseable)."""
    if not time_expr:
        return

    # Plain offset, optionally suffixed with 's': e.g. '12.5s'
    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    # Clock time 'HH:MM:SS(.fff)'; some files separate the fraction with ':'.
    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours = int(clock_match.group(1))
        minutes = int(clock_match.group(2))
        seconds = float(clock_match.group(3).replace(':', '.'))
        return 3600 * hours + 60 * minutes + seconds
2333
2334
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode 'HH:MM:SS,mmm'."""
    hours = int(seconds / 3600)
    minutes = int((seconds % 3600) / 60)
    secs = int(seconds % 60)
    millis = int((seconds % 1) * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2337
2338
def dfxp2srt(dfxp_data):
    """Convert a TTML/DFXP subtitle document (unicode string) to SRT text."""
    # Helper that expands 'prefix:tag' names against the known TTML namespaces.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Event-driven target that flattens a <p> element to plain text,
        # turning <br/> (in any supported namespace) into newlines.
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Re-serialize the node and feed it through the target parser above
        # to extract its text content.
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    # Paragraphs may live in any of the known TTML/TTAF namespaces, or none.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        # Cues without a usable begin time (or neither end nor dur) are dropped.
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            # No explicit end: derive it from the duration.
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2392
2393
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2397
2398
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean option for an external command line.

    Returns [] when the parameter is unset; otherwise emits command_option
    with true_value/false_value, joined by separator if one is given.
    """
    param = params.get(param)
    if param is None:
        # An unset option should simply be omitted instead of tripping the
        # isinstance assertion below.
        return []
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
2405
2406
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit command_option alone when params[param] equals expected_value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2410
2411
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra CLI args stored under *param*, or *default*
    when unset."""
    # NOTE(review): the mutable default is shared across calls; callers must
    # not mutate the fallback list in place.
    configured = params.get(param)
    if configured is None:
        return default
    assert isinstance(configured, list)
    return configured
2418
2419
class ISO639Utils(object):
    """Translate between ISO 639-1 (two-letter) and ISO 639-2/T
    (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Maps ISO 639-1 -> ISO 639-2/T.
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two letters are significant (e.g. 'en-US' -> 'en');
        # returns None for unknown codes.
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the map; returns None implicitly when unknown.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
2620
2621
class ISO3166Utils(object):
    """Resolve ISO 3166-1 alpha-2 country codes to full country names."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive lookup; returns None for unknown codes.
        return cls._country_map.get(code.upper())
2880
2881
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets each request override the configured proxy via
    the internal 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        # The lambda's keyword defaults deliberately bind proxy/type/meth at
        # definition time so each scheme's handler captures its own values
        # (late-binding closure workaround).
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy (set through the internal header) overrides the
        # handler-level one; the header is stripped before the request is sent.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        # '__noproxy__' is the sentinel meaning "connect directly".
        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            # SOCKS proxies are handed off via another internal header;
            # youtube-dl's http/https handlers do wrapping the socket with socks
            req.add_header('Ytdl-socks-proxy', proxy)
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2905
2906
2907 def ohdave_rsa_encrypt(data, exponent, modulus):
2908 '''
2909 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
2910
2911 Input:
2912 data: data to encrypt, bytes-like object
2913 exponent, modulus: parameter e and N of RSA algorithm, both integer
2914 Output: hex string of encrypted data
2915
2916 Limitation: supports one block encryption only
2917 '''
2918
2919 payload = int(binascii.hexlify(data[::-1]), 16)
2920 encrypted = pow(payload, exponent, modulus)
2921 return '%x' % encrypted
2922
2923
def encode_base_n(num, n, table=None):
    """Encode a non-negative integer in base n using the given digit table
    (defaults to 0-9, a-z, A-Z)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    table = table or FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Collect digits least-significant first, then reverse.
    digits = []
    while num:
        num, rem = divmod(num, n)
        digits.append(table[rem])
    return ''.join(reversed(digits))
2940
2941
def decode_packed_codes(code):
    """Undo Dean Edwards style 'p.a.c.k.e.r' JavaScript obfuscation."""
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Map each base-n token back to its original symbol; an empty symbol
    # means the token stands for itself.
    symbol_table = {}
    for index in range(count - 1, -1, -1):
        token = encode_base_n(index, base)
        symbol_table[token] = symbols[index] or token

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
2960
2961
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list into a dict, unquoting quoted values."""
    info = {}
    for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        # Strip the surrounding double quotes from quoted values.
        info[key] = val[1:-1] if val.startswith('"') else val
    return info
2969
2970
def urshift(val, n):
    """Unsigned 32-bit right shift (JavaScript's '>>>' operator)."""
    if val < 0:
        # Reinterpret a negative value as its unsigned 32-bit equivalent.
        val += 0x100000000
    return val >> n
2973
2974
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    # Reference: https://www.w3.org/TR/PNG/
    """Decode a PNG byte string into (width, height, pixels).

    pixels is a list of rows; each row is a flat list of channel bytes
    (three per pixel) with the PNG scanline filters already reversed.

    NOTE(review): the stride computation assumes 8-bit RGB without an alpha
    channel -- confirm for other color types/bit depths.
    """
    header = png_data[8:]

    # Every valid PNG starts with the fixed signature followed by IHDR.
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Big-endian unsigned ints of 1, 2 or 4 bytes.
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Walk the chunk stream: 4-byte length, 4-byte type, payload, 4-byte CRC.
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is the first chunk; its payload starts with width and height.
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Image data may be split over several IDAT chunks; concatenate them
    # before inflating.
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Bytes per scanline (3 channel bytes per pixel).
    stride = width * 3
    pixels = []

    # Return the already-reconstructed channel byte at a flat byte offset
    # into the image (offsets exclude the per-scanline filter bytes).
    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is prefixed with one filter-type byte.
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Neighbor bytes used by the filters: same channel of the pixel
            # to the left (3 bytes back) and of the pixel above; 0 at edges.
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # Filter type 0 (None) leaves the byte unchanged.
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Choose the predictor (left, up, upper-left) closest to p.
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels