]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[openload] Fix title extraction (Closes #9298)
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import errno
15 import functools
16 import gzip
17 import itertools
18 import io
19 import json
20 import locale
21 import math
22 import operator
23 import os
24 import pipes
25 import platform
26 import re
27 import ssl
28 import socket
29 import struct
30 import subprocess
31 import sys
32 import tempfile
33 import traceback
34 import xml.etree.ElementTree
35 import zlib
36
37 from .compat import (
38 compat_HTMLParser,
39 compat_basestring,
40 compat_chr,
41 compat_etree_fromstring,
42 compat_html_entities,
43 compat_http_client,
44 compat_kwargs,
45 compat_parse_qs,
46 compat_socket_create_connection,
47 compat_str,
48 compat_urllib_error,
49 compat_urllib_parse,
50 compat_urllib_parse_urlencode,
51 compat_urllib_parse_urlparse,
52 compat_urllib_request,
53 compat_urlparse,
54 compat_xpath,
55 shlex_quote,
56 )
57
58
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers added to every request (see YoutubeDLHandler.http_request);
# a desktop browser User-Agent avoids being served bot/mobile variants of pages
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel distinguishing "no default supplied" from an explicit default of None
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Media file extensions recognized when guessing an extension from a URL
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')
91
92
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        enc = locale.getpreferredencoding()
        # Verify the reported encoding is actually usable for encoding text
        'TEST'.encode(enc)
    except Exception:
        enc = 'UTF-8'
    return enc
106
107
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        # (fixed: the lambdas previously ignored their argument and closed
        # over `fn` instead — harmless by accident, but misleading)
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        # Create the temp file next to the target so os.rename never crosses
        # a filesystem boundary (atomic where the OS supports it)
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temp file; re-raise the original error
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
160
161
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # key is interpolated into an XPath expression, so restrict it to a
        # safe character set before building the predicate
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    # Python 2.6 ElementTree does not support attribute predicates in XPath,
    # so emulate [@key] / [@key='val'] by scanning all candidates
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] (Python 2.6 fallback) """
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
176
177 # On python2.6 the xml.etree.ElementTree.Element methods don't support
178 # the namespace parameter
179
180
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' components of an XPath into ElementTree's
    '{uri}tag' form, resolving prefixes through ns_map."""
    expanded = []
    for component in path.split('/'):
        parts = component.split(':')
        if len(parts) == 1:
            # No namespace prefix on this component
            expanded.append(parts[0])
        else:
            prefix, tag = parts
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
191
192
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath (a single expression, or an
    iterable of fallback expressions tried in order).

    Returns `default` when supplied and nothing matches; raises
    ExtractorError when fatal; otherwise returns None on no match.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # Fixed: initialize n so an empty fallback list no longer raises
        # NameError and instead goes through normal default/fatal handling
        n = None
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
214
215
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element, but return the matched element's text content,
    applying the same default/fatal semantics when the text is missing."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        if fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        return None
    return n.text
229
230
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute `key` of the element matching xpath[@key], with the
    usual default/fatal handling when no such element exists."""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
242
243
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper over the generic attribute matcher, with attribute 'id'
    return get_element_by_attribute('id', id, html)
247
248
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # Regex-based scrape, not a real HTML parser: matches the first tag whose
    # attribute list contains attribute=value (value may be bare, single- or
    # double-quoted) and captures everything up to the matching close tag
    # (the \1 backreference). Nested same-name tags are not handled.
    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not m:
        return None
    res = m.group('content')

    # Strip surrounding quotes if the captured content itself is quoted
    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    # Entities inside the content are decoded before returning
    return unescapeHTML(res)
270
271
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        # Attributes of the most recently seen start tag, as a dict
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs arrives as a list of (name, value) pairs; value is already
        # entity-decoded by the base parser
        self.attrs = dict(attrs)
280
281
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    attr_parser.feed(html_element)
    attr_parser.close()
    return attr_parser.attrs
302
303
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Literal newlines become spaces; markup line/paragraph breaks become
    # real newlines
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Strip html tags
    text = re.sub('<.*?>', '', text)
    # Replace html entities, then trim surrounding whitespace
    return unescapeHTML(text).strip()
319
320
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            # '-' means standard output; on Windows stdout must be switched
            # to binary mode so media bytes are not mangled by CRLF translation
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission error cannot be fixed by renaming — re-raise as-is
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
351
352
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a parseable RFC 2822 date
        return None
    return email.utils.mktime_tz(parsed)
360
361
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Control characters and '?' are always dropped
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Keep timestamps readable: 12:34:56 -> 12_34_56 (before ':' handling)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(c) for c in s)
    if not is_id:
        # Collapse runs of underscores and trim noise at the edges
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
398
399
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # No-op on other platforms (POSIX allows almost any character)
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # splitdrive does not recognize UNC paths before Python 2.7
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters forbidden in Windows path components, plus a
    # trailing dot or space (also invalid); '.' and '..' stay untouched
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
416
417
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    """Give scheme-relative URLs ('//host/...') an explicit http: scheme."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
422
423
def sanitized_Request(url, *args, **kwargs):
    # Build a urllib Request with the URL sanitized first (http: scheme is
    # prepended to protocol-less URLs); extra args pass through unchanged
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
426
427
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first-seen
    order. Uses list membership (not a set) so unhashable elements work. """
    deduped = []
    for item in iterable:
        if item not in deduped:
            deduped.append(item)
    return deduped
435
436
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character references: &#160; (decimal) or &#xA0; (hex)
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr  # '0x...' so int() parses it as hex
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
459
460
def unescapeHTML(s):
    """Replace HTML entities (e.g. &amp;, &#160;) in s with their characters.

    Returns None for None input; s must be a text (not bytes) string.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
468
469
def get_subprocess_encoding():
    """Return the encoding to use for data exchanged with subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    # getfilesystemencoding() may return None; fall back to UTF-8
    if encoding is None:
        encoding = 'utf-8'
    return encoding
480
481
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file

    Encode a text filename to bytes where the platform requires it
    (Python 2 on non-Windows); returns the string unchanged elsewhere.
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    # 'ignore': drop characters the target encoding cannot represent
    return s.encode(get_subprocess_encoding(), 'ignore')
504
505
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename: decode a byte filename to text on
    Python 2; anything else passes through unchanged."""
    # Python 3 (or already-decoded input): nothing to do
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
515
516
def encodeArgument(s):
    """Encode a subprocess argument, like a filename with for_subprocess=True."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
524
525
def decodeArgument(b):
    # Inverse of encodeArgument: decode subprocess output/argument to text
    return decodeFilename(b, True)
528
529
def decodeOption(optval):
    """Decode a command-line option value to text using the locale encoding."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
538
539
def formatSeconds(secs):
    """Render a second count as 'H:MM:SS', 'M:SS', or bare seconds.

    The leading unit is not zero-padded; exactly 3600 renders as '60:00'
    and exactly 60 as '60' (boundaries use strict '>').
    """
    total_mins, sec = divmod(secs, 60)
    hrs, mins = divmod(total_mins, 60)
    if secs > 3600:
        return '%d:%02d:%02d' % (hrs, mins, sec)
    if secs > 60:
        return '%d:%02d' % (total_mins, sec)
    return '%d' % secs
547
548
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honoring params['nocheckcertificate'],
    picking the best TLS configuration the running Python supports."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # Disable both hostname and certificate verification
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # Old Pythons: no usable SSLContext on HTTPSHandler at all
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
572
573
def bug_reports_message():
    """Return the standard bug-report footer appended to unexpected errors."""
    # Tailor the update hint to how this copy of youtube-dl was installed
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return (
        '; please report this issue on https://yt-dl.org/bug .'
        ' Make sure you are using the latest version; %s.'
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        % update_cmd)
583
584
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-level failures are always treated as expected (not bugs)
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get the "please report this" footer appended
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        # Render self.traceback as a printable string, or None if absent
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
612
613
class UnsupportedError(ExtractorError):
    """Raised when no extractor supports the given URL (always 'expected')."""
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        # Keep the offending URL for callers that want to inspect it
        self.url = url
619
620
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # Specialized ExtractorError; carries no extra state of its own
    pass
624
625
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Original sys.exc_info() triple when this wraps another exception
        self.exc_info = exc_info
638
639
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    # Carries no extra state; the message alone describes the clash
    pass
647
648
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Initialize the base class so str(exc) and tracebacks show the
        # message (previously Exception.__init__ was never called and
        # str() of the exception was empty)
        super(PostProcessingError, self).__init__(msg)
        # Kept for backward compatibility with callers reading .msg
        self.msg = msg
658
659
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    # Used as a control-flow signal to stop further downloads
    pass
663
664
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    # Note: ExtractorError treats this as an "expected" error type
    pass
672
673
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Initialize the base class with a descriptive message so str(exc)
        # is informative (previously Exception.__init__ was never called
        # and str() of the exception was empty)
        super(ContentTooShortError, self).__init__(
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
686
687
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an HTTP(S) connection, honoring the 'source_address' param."""
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No native source_address support: monkey-patch connect() to
            # bind the socket via create_connection with the desired address
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
713
714
def handle_youtubedl_headers(headers):
    """Strip internal youtube-dl control headers before a real request.

    'Youtubedl-no-compression' is never sent to the server; its presence
    additionally removes any Accept-Encoding header so the response is
    served uncompressed. Without it, headers are returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict(
        (k, v) for k, v in headers.items()
        if k.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
723
724
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # YoutubeDL params dict (e.g. 'source_address' is read from it)
        self._params = params

    def http_open(self, req):
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        # Try raw deflate (RFC 1951) first; some servers send a zlib-wrapped
        # stream (RFC 1950) instead, which plain decompress handles
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # addinfourl accepts a `code` constructor argument only on newer
        # Pythons; fall back to setting the attribute afterwards
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        # Add any std_headers the caller did not set explicitly
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        # Strip internal control headers (e.g. Youtubedl-no-compression)
        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry decompression with up to 1023 trailing bytes trimmed
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
843
844
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that threads the YoutubeDL params (and optionally a
    custom connection class / SSL context) into each connection."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        # Forward context/check_hostname only where the base handler set them
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)
860
861
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor wiring https handling to the same logic as http."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # NOTE(review): the workaround below is intentionally disabled —
        # kept for reference, do not delete
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
884
885
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # Fractional seconds are not part of the strptime pattern below
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        tz_m = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if not tz_m:
            # No timezone designator: treat as UTC
            timezone = datetime.timedelta()
        else:
            date_str = date_str[:-len(tz_m.group(0))]
            if not tz_m.group('sign'):
                # Trailing 'Z' means UTC
                timezone = datetime.timedelta()
            else:
                sign = 1 if tz_m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(tz_m.group('hours')),
                    minutes=sign * int(tz_m.group('minutes')))
    try:
        fmt = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, fmt) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        # Unparseable date string
        return None
915
916
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    day_first selects whether ambiguous numeric dates are read as
    DD-MM-YYYY (True) or MM-DD-YYYY (False). Returns None on failure.
    """

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M',
        '%b %dnd %Y %I:%M',
        '%b %dth %Y %I:%M',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    # NOTE: no break — if several formats match, the LAST matching one wins
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
981
982
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL, falling back to default_ext."""
    if url is None:
        return default_ext
    # Drop the query string, then take whatever follows the last '.'
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = guess.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
994
995
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle filename: media base name + '.<lang>.<format>'."""
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
998
999
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string literal: '\d' in a plain literal is an invalid escape
    # sequence (DeprecationWarning since Python 3.6)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation? timedelta has no month/year units, so
        # approximate them as 30 and 365 days respectively
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'  # timedelta keywords are plural: days=, weeks=
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    # Plain absolute date; raises ValueError for anything unrecognized
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1027
1028
def hyphenate_date(date_str):
    """Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        # Anything that is not a plain YYYYMMDD date passes through untouched
        return date_str
    return '-'.join(match.groups())
1037
1038
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the widest representable range
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1068
1069
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    # On Python 2 platform.platform() may return bytes; decode using the
    # preferred locale encoding so the result is always text
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res
1078
1079
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    # Writes Unicode to a Windows console through WriteConsoleW, which
    # displays characters the byte-oriented stream encoding would mangle.

    import ctypes
    import ctypes.wintypes

    # Map C file descriptors to Win32 standard-handle IDs
    # (STD_OUTPUT_HANDLE = -11, STD_ERROR_HANDLE = -12)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # WriteConsoleW only works on real console handles, not files/pipes
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        # (len(s) when there is none)
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write runs of BMP characters in chunks of up to 1024; a non-BMP
        # character is written alone as a surrogate pair (count 2)
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1153
1154
def write_string(s, out=None, encoding=None):
    """Write the text s to the stream out (default: stderr), coping with the
    platform- and Python-version-specific encoding quirks."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows, prefer the console-level WriteConsoleW path so non-ANSI
    # characters render correctly
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode explicitly so
        # the chosen encoding (and 'ignore' error handling) is honoured
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
1175
1176
def bytes_to_intlist(bs):
    """Turn a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: indexing bytes yields ints
        return list(bs)
    # Python 2: indexing a byte string yields 1-char strings
    return [ord(c) for c in bs]
1184
1185
def intlist_to_bytes(xs):
    """Turn a list of integer byte values (0-255) into a byte string."""
    if not xs:
        return b''
    # bytearray accepts an iterable of ints on both Python 2 and 3, which
    # avoids building a '%dB' struct format string for every call
    return bytes(bytearray(xs))
1190
1191
# Cross-platform file locking: define _lock_file/_unlock_file using
# LockFileEx/UnlockFileEx on Windows, fcntl.flock elsewhere, and a
# raising stub when neither is available (e.g. Jython).
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Win32 OVERLAPPED structure; Offset/OffsetHigh give the position
        # of the locked region
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the largest expressible byte range so the whole file is covered
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for the later unlock
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1265
1266
class locked_file(object):
    """File wrapper that holds an OS-level lock for the life of a `with` block."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Readers can share the lock; writers and appenders need it exclusively
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1296
1297
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to UTF-8 when unknown."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1301
1302
def shell_quote(args):
    """Quote a list of arguments for safe use on a POSIX shell command line."""
    encoding = get_filesystem_encoding()

    def as_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(encoding) if isinstance(a, bytes) else a

    return ' '.join(pipes.quote(as_text(a)) for a in args)
1312
1313
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload rides in the fragment, which is never sent to servers
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, sdata)
1320
1321
def unsmuggle_url(smug_url, default=None):
    """Extract (url, data) smuggled into a URL fragment by smuggle_url."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    payload = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1329
1330
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. '1.25MiB'."""
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    SUFFIXES = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    # Clamp so absurdly large values do not index past the suffix table
    exponent = min(exponent, len(SUFFIXES) - 1)
    suffix = SUFFIXES[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1343
1344
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number><unit>' from s using unit_table; None if absent."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    # Accept ',' as a decimal separator too
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
1354
1355
def parse_filesize(s):
    """Parse a human-readable file size like '5.6 MiB' into a byte count."""
    if s is None:
        return None

    # For every SI prefix: '<P>iB' and lowercase '<p>B' are binary (1024^n),
    # '<P>B' and '<P>b' are decimal (1000^n). The lower-case forms are of
    # course incorrect and unofficial, but we support those too.
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
    }
    for power, prefix in enumerate('KMGTPEZY', start=1):
        _UNIT_TABLE[prefix + 'iB'] = 1024 ** power
        _UNIT_TABLE[prefix + 'B'] = 1000 ** power
        _UNIT_TABLE[prefix.lower() + 'B'] = 1024 ** power
        _UNIT_TABLE[prefix + 'b'] = 1000 ** power

    return lookup_unit_table(_UNIT_TABLE, s)
1400
1401
def parse_count(s):
    """Parse a view/like count such as '1.2M' into an integer."""
    if s is None:
        return None

    s = s.strip()

    # Plain numbers, possibly with thousands separators
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1421
1422
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1430
1431
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [name[:3] for name in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
1440
1441
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Leave existing entities and numeric character references untouched
    pattern = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(pattern, '&amp;', xml_str)
1448
1449
def setproctitle(title):
    """Set the process name shown by tools like ps (glibc systems only)."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        # Not a glibc system; silently do nothing
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 = PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1469
1470
def remove_start(s, start):
    """Strip the prefix `start` from `s` if present."""
    return s[len(start):] if s.startswith(start) else s
1475
1476
def remove_end(s, end):
    """Strip the suffix `end` from `s` if present.

    An empty `end` must be a no-op: the original unconditional
    `s[:-len(end)]` slice evaluated to s[:0] == '' for end == ''.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1481
1482
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        return s
    # Only strip when both ends carry the same quote character
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1490
1491
def url_basename(url):
    """Return the last path component of a URL, ignoring query and fragment."""
    path = compat_urlparse.urlparse(url).path
    return path.rstrip('/').rpartition('/')[2]
1495
1496
class HEADRequest(compat_urllib_request.Request):
    # Request subclass that issues HTTP HEAD instead of the default GET
    def get_method(self):
        return 'HEAD'
1500
1501
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int scaled by invscale/scale; return default on failure.

    If get_attr is given, the value is first read from that attribute of v.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # TypeError: v is not int()-convertible at all (e.g. a list or dict)
        return default
1514
1515
def str_or_none(v, default=None):
    """Return v as a compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1518
1519
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators ('.', ',') and an explicit '+' sign
    return int(re.sub(r'[,\.\+]', '', int_str))
1526
1527
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float scaled by invscale/scale; return default on failure."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # TypeError: v is not float()-convertible at all (e.g. a list or dict)
        return default
1535
1536
def parse_duration(s):
    """Parse a duration string ('1:23:45', '2h 3min', 'PT1H2M3S', ...) into
    a number of seconds (float), or None if it cannot be parsed."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    # First try clock-style [[[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # Then unit-suffixed forms, optionally ISO-8601-prefixed ('PT2H3M')
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # Finally fractional forms such as '1.5 hours' / '2.5 mins'
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        # 'ms' still carries its leading '.', so float() yields the fraction
        duration += float(ms)
    return duration
1583
1584
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension ('a.mp4' -> 'a.ext.mp4').

    When expected_real_ext is given and does not match the actual
    extension, ext is appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1591
1592
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename with ext ('a.mp4' -> 'a.ext').

    When expected_real_ext is given and does not match the actual
    extension, ext is appended after the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        base = filename
    else:
        base = name
    return '{0}.{1}'.format(base, ext)
1598
1599
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        # Run the binary once, discarding its output; failure to spawn
        # means it is not available
        subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
1608
1609
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        proc = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = proc.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1623
1624
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version number from an executable's --version output."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1634
1635
class PagedList(object):
    # Base class for paginated result lists; subclasses implement
    # getslice(start, end)
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1640
1641
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily via pagefunc(pagenum), with an
    optional per-page result cache."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Return the items in [start, end), fetching only the needed pages."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of `start` within this page (0 for all later pages)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past `end` within this page (None = keep whole page)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1692
1693
class InAdvancePagedList(PagedList):
    """Paged list where the total number of pages is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the items in [start, end), fetching only the relevant pages."""
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Items to drop from the first page, items still wanted overall
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page needs its head trimmed
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # Final page: trim to the requested count and stop
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1721
1722
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences (8 hex digits) in s."""
    decode = codecs.getdecoder('unicode_escape')

    def replace(m):
        return decode(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', replace, s)
1729
1730
def lowercase_escape(s):
    """Decode \\uXXXX escape sequences (4 hex digits) in s."""
    decode = codecs.getdecoder('unicode_escape')

    def replace(m):
        return decode(m.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', replace, s)
1737
1738
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # quote() on Python 2 needs bytes input to handle non-ASCII text
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Keep RFC 3986 reserved and sub-delimiter characters intact
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1744
1745
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host is IDNA-encoded; every other component is percent-escaped
    return parts._replace(
        netloc=parts.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    ).geturl()
1756
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
    # See https://bugs.python.org/issue19099
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Modern struct accepts text format strings directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1774
1775
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, skipping comments and blank lines."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Strip a UTF-8 BOM that survived a naive decode
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        urls = []
        for line in fd:
            fixed = fixup(line)
            if fixed:
                urls.append(fixed)
        return urls
1790
1791
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1794
1795
def update_url_query(url, query):
    """Return url with the given query parameters merged in."""
    if not query:
        return url
    parsed = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parsed.query)
    params.update(query)
    # doseq=True so list values expand into repeated parameters
    return compat_urlparse.urlunparse(parsed._replace(
        query=compat_urllib_parse_urlencode(params, True)))
1804
1805
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Clone a urllib Request, optionally overriding URL, data, headers, query."""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    new_url = update_url_query(url or req.get_full_url(), query)
    # Preserve HEAD semantics when cloning
    req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
    new_req = req_type(
        new_url, data=data or req.data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
1818
1819
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up the first usable value for one key or a sequence of keys."""
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        # A value is usable when present, not None, and (unless
        # skip_false_values is off) truthy
        if key in d and d[key] is not None and not (skip_false_values and not d[key]):
            return d[key]
    return default
1828
1829
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as compat_str, decoding byte strings with the given encoding."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1832
1833
# MPAA rating -> minimum viewer age
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1841
1842
def parse_age_limit(s):
    """Parse an age limit like '18+' or an MPAA rating into an integer age."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s)
1848
1849
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, leaving only the JSON payload."""
    pattern = r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(pattern, r'\1', code)
1853
1854
def js_to_json(code):
    """Convert a JavaScript object literal into valid JSON text."""
    _SINGLE_QUOTE_ESCAPES = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_token(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            # Undo JS-style \' escapes inside double-quoted strings
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            # Re-escape a single-quoted string for double quotes
            v = re.sub(r"\\\\|\\'|\"", lambda em: _SINGLE_QUOTE_ESCAPES[em.group(0)], v[1:-1])
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_token, code)
    # Drop trailing commas before closing brackets
    return re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1878
1879
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality score; unknown ids rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1888
1889
# Default output filename template: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1891
1892
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
1901
1902
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(int(piece) for piece in re.split(r'[-.]', v))
1905
1906
def is_outdated_version(version, limit, assume_new=True):
    """Return True if version is older than limit; unparsable versions are
    treated as new or old according to assume_new."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
1914
1915
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updatable when running from a zip bundle or a frozen (py2exe) build
    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
1921
1922
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(shlex_quote(arg) for arg in args)
1926
1927
def error_to_compat_str(err):
    """Stringify an exception, decoding Python 2 byte strings properly."""
    err_str = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str
1935
1936
def mimetype2ext(mt):
    """Map a MIME type to a file extension; unknown subtypes pass through.

    Returns None for a None input (e.g. a missing Content-Type header),
    where the original would raise AttributeError on mt.rpartition.
    """
    if mt is None:
        return None

    # Full-type matches take priority over subtype matches
    ext = {
        'audio/mp4': 'm4a',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res, res)
1957
1958
def urlhandle_detect_ext(url_handle):
    """Guess the file extension for a response from its HTTP headers."""
    try:
        headers = url_handle.headers

        def getheader(h):
            return headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    # Prefer the filename advertised in Content-Disposition
    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    # Otherwise fall back on the Content-Type MIME type
    return mimetype2ext(getheader('Content-Type'))
1975
1976
def encode_data_uri(data, mime_type):
    """Encode raw bytes as an RFC 2397 base64 data: URI."""
    encoded = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, encoded)
1979
1980
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # A missing limit on either side means no restriction applies
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1989
1990
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Recognized byte-order marks, longest first so prefixes don't shadow
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    s = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if s is None:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
2009
2010
def determine_protocol(info_dict):
    """Work out the download protocol for a format/info dict."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    # Streaming protocols identified by URL scheme prefix
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    # Manifest formats identified by file extension
    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2031
2032
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Every column except the last is left-justified and padded by one space
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
2039
2040
2041 def _match_one(filter_part, dct):
2042 COMPARISON_OPERATORS = {
2043 '<': operator.lt,
2044 '<=': operator.le,
2045 '>': operator.gt,
2046 '>=': operator.ge,
2047 '=': operator.eq,
2048 '!=': operator.ne,
2049 }
2050 operator_rex = re.compile(r'''(?x)\s*
2051 (?P<key>[a-z_]+)
2052 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2053 (?:
2054 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2055 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2056 )
2057 \s*$
2058 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2059 m = operator_rex.search(filter_part)
2060 if m:
2061 op = COMPARISON_OPERATORS[m.group('op')]
2062 if m.group('strval') is not None:
2063 if m.group('op') not in ('=', '!='):
2064 raise ValueError(
2065 'Operator %s does not support string values!' % m.group('op'))
2066 comparison_value = m.group('strval')
2067 else:
2068 try:
2069 comparison_value = int(m.group('intval'))
2070 except ValueError:
2071 comparison_value = parse_filesize(m.group('intval'))
2072 if comparison_value is None:
2073 comparison_value = parse_filesize(m.group('intval') + 'B')
2074 if comparison_value is None:
2075 raise ValueError(
2076 'Invalid integer value %r in filter part %r' % (
2077 m.group('intval'), filter_part))
2078 actual_value = dct.get(m.group('key'))
2079 if actual_value is None:
2080 return m.group('none_inclusive')
2081 return op(actual_value, comparison_value)
2082
2083 UNARY_OPERATORS = {
2084 '': lambda v: v is not None,
2085 '!': lambda v: v is None,
2086 }
2087 operator_rex = re.compile(r'''(?x)\s*
2088 (?P<op>%s)\s*(?P<key>[a-z_]+)
2089 \s*$
2090 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2091 m = operator_rex.search(filter_part)
2092 if m:
2093 op = UNARY_OPERATORS[m.group('op')]
2094 actual_value = dct.get(m.group('key'))
2095 return op(actual_value)
2096
2097 raise ValueError('Invalid filter part %r' % filter_part)
2098
2099
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&' combines sub-filters conjunctively
    return all(
        _match_one(part, dct) for part in filter_str.split('&'))
2105
2106
def match_filter_func(filter_str):
    """Build a --match-filter callback: returns None to accept a video, or a
    human-readable skip reason otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2115
2116
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (None if unparsable)."""
    if not time_expr:
        return

    m = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if m:
        return float(m.group('time_offset'))

    m = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if m:
        hours, minutes, seconds = m.groups()
        # Frame-style 'HH:MM:SS:FF' is approximated as a decimal fraction
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
2128
2129
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
2132
2133
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup (a text string) into SRT format."""
    # Paragraphs may live in any of the known TTML namespaces (or none)
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        # Streaming parser target that collects the text content of a <p>,
        # turning <br/> elements into newlines
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Extract the plain-text content of one <p> element
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        # Entries without a usable start time are dropped entirely
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            # No explicit end: derive it from the duration attribute
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2187
2188
def cli_option(params, command_option, param):
    """Return [command_option, value] when *param* is set in *params*, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2192
2193
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build a CLI argument list for a boolean option.

    params: dict of option values
    command_option: the flag name (e.g. '--check-certificate')
    param: key to look up in *params*; must map to a bool when present
    true_value/false_value: textual values emitted for True/False
    separator: when given, emit one 'flag<separator>value' token instead
        of two separate tokens

    Returns a list of argv tokens, or [] when the option is unset.
    """
    param = params.get(param)
    # An absent option should simply produce no arguments; previously a
    # missing key reached the isinstance assertion with None and crashed.
    if param is None:
        return []
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
2200
2201
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals *expected_value*, else []."""
    value = params.get(param)
    if value == expected_value:
        return [command_option]
    return []
2205
2206
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra CLI args stored under *param*, or *default*.

    Note: *default* is never mutated, so the mutable default is safe here.
    """
    extra_args = params.get(param)
    if extra_args is None:
        return default
    assert isinstance(extra_args, list)
    return extra_args
2213
2214
class ISO639Utils(object):
    """Conversion helpers between ISO 639-1 (two-letter) and
    ISO 639-2/T (three-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Maps ISO 639-1 codes to their ISO 639-2/T equivalents.
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are used, so region-qualified
        # codes (e.g. 'en-US') resolve via their base language.
        # Returns None when the code is unknown.
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Linear reverse lookup over the map; implicitly returns None
        # when no two-letter equivalent exists.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
2415
2416
class ISO3166Utils(object):
    """Lookup of full country names from ISO 3166-1 alpha-2 country codes."""
    # From http://data.okfn.org/data/core/country-list
    # Maps upper-case two-letter country codes to English country names.
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive on input; returns None for unknown codes.
        return cls._country_map.get(code.upper())
2675
2676
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets each request override the proxy via the
    internal 'Ytdl-request-proxy' request header."""

    def __init__(self, proxies=None):
        # Set default handlers
        # Install an <scheme>_open handler for each scheme so that every
        # request is routed through proxy_open, defaulting to the
        # '__noproxy__' sentinel (i.e. direct connection). The lambda's
        # keyword defaults bind the *current* values of `type` and
        # `self.proxy_open` at definition time, avoiding the late-binding
        # closure pitfall inside the loop.
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy in the 'Ytdl-request-proxy' header replaces
        # the default; the internal header is stripped before the request
        # is actually sent.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2696
2697
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The payload is read little-endian: reverse the bytes, then interpret
    # the hex representation as an integer.
    payload = int(binascii.hexlify(data[::-1]), 16)
    # Modular exponentiation via three-argument pow.
    return '%x' % pow(payload, exponent, modulus)
2713
2714
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer *num* in base *n*.

    table: optional digit alphabet; defaults to 0-9a-zA-Z truncated to
        *n* characters. Raises ValueError if *n* exceeds the table size.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    # Digits were produced least-significant first.
    return ''.join(reversed(digits))
2731
2732
def decode_packed_codes(code):
    """Unpack JavaScript obfuscated with Dean Edwards' p.a.c.k.e.r.

    Extracts the packed source, base, symbol count and symbol table from
    the eval wrapper, then substitutes every word token with its symbol.
    """
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Build the token -> symbol mapping; an empty symbol means the token
    # stands for itself.
    symbol_table = {}
    for idx in range(count - 1, -1, -1):
        key = encode_base_n(idx, base)
        symbol_table[key] = symbols[idx] or key

    return re.sub(
        r'\b(\w+)\b', lambda m: symbol_table[m.group(0)],
        obfuscated_code)