# youtube_dl/utils.py (release 2014.03.03)
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import contextlib
5 import ctypes
6 import datetime
7 import email.utils
8 import errno
9 import gzip
10 import itertools
11 import io
12 import json
13 import locale
14 import math
15 import os
16 import pipes
17 import platform
18 import re
19 import ssl
20 import socket
21 import struct
22 import subprocess
23 import sys
24 import traceback
25 import zlib
26
27 try:
28 import urllib.request as compat_urllib_request
29 except ImportError: # Python 2
30 import urllib2 as compat_urllib_request
31
32 try:
33 import urllib.error as compat_urllib_error
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_error
36
37 try:
38 import urllib.parse as compat_urllib_parse
39 except ImportError: # Python 2
40 import urllib as compat_urllib_parse
41
42 try:
43 from urllib.parse import urlparse as compat_urllib_parse_urlparse
44 except ImportError: # Python 2
45 from urlparse import urlparse as compat_urllib_parse_urlparse
46
47 try:
48 import urllib.parse as compat_urlparse
49 except ImportError: # Python 2
50 import urlparse as compat_urlparse
51
52 try:
53 import http.cookiejar as compat_cookiejar
54 except ImportError: # Python 2
55 import cookielib as compat_cookiejar
56
57 try:
58 import html.entities as compat_html_entities
59 except ImportError: # Python 2
60 import htmlentitydefs as compat_html_entities
61
62 try:
63 import html.parser as compat_html_parser
64 except ImportError: # Python 2
65 import HTMLParser as compat_html_parser
66
67 try:
68 import http.client as compat_http_client
69 except ImportError: # Python 2
70 import httplib as compat_http_client
71
72 try:
73 from urllib.error import HTTPError as compat_HTTPError
74 except ImportError: # Python 2
75 from urllib2 import HTTPError as compat_HTTPError
76
77 try:
78 from urllib.request import urlretrieve as compat_urlretrieve
79 except ImportError: # Python 2
80 from urllib import urlretrieve as compat_urlretrieve
81
82
# subprocess.DEVNULL was only added in Python 3.3; on older interpreters,
# fall back to opening os.devnull by hand. Both variants return a writable
# null sink suitable as a stdout/stderr argument for subprocess calls.
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
88
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    # NOTE: this whole branch only runs on Python 2, so the py2-only
    # constructs below (`str.decode('hex')`, the `unicode` builtin) are safe.
    def _unquote(string, encoding='utf-8', errors='replace'):
        # Percent-decode *string*, decoding byte sequences with *encoding*.
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to decode.
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    # '%%' in the input: treat as a literal '%'.
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Split a query string into a list of (name, value) pairs;
        # both '&' and ';' are accepted as pair separators.
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Same signature/semantics as urllib.parse.parse_qs: a dict mapping
        # each name to the list of all its values.
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
167
# Text-type / character compat aliases: unify Python 2's unicode/unichr with
# Python 3's str/chr under one name each.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr

# Python 2.6's ElementTree raises ExpatError rather than ParseError.
try:
    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError:  # Python 2.6
    from xml.parsers.expat import ExpatError as compat_xml_parse_error
182
def compat_ord(c):
    """Return the integer code point of *c*.

    Iterating/indexing a bytes object yields ints on Python 3 but 1-char
    strings on Python 2; this helper accepts either and always returns int.
    """
    # isinstance is the idiomatic check and also accepts int subclasses
    # (bool, IntEnum members), unlike the previous `type(c) is int`.
    if isinstance(c, int):
        return c
    return ord(c)
186
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers added to every outgoing request (see YoutubeDLHandler).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
197
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe that the reported codec actually exists and can encode.
        u'TEST'.encode(pref)
    except Exception:
        # Previously a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit; Exception is as wide as we want.
        pref = 'UTF-8'

    return pref
211
# On Python 2 stdout may reject unicode, so encode with the preferred
# encoding and escape unencodable characters as XML character references;
# on Python 3 print handles str natively.
if sys.version_info < (3,0):
    def compat_print(s):
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        assert type(s) == type(u'')
        print(s)
219
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3,0):
    def write_json_file(obj, fn):
        # Binary mode: json.dump produces ASCII-escaped bytes on Python 2.
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
230
if sys.version_info >= (2,7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key/val are interpolated into the XPath expression below; these
        # asserts prevent injecting arbitrary predicate syntax.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val):
        # Python 2.6 ElementTree does not support attribute predicates;
        # scan all matches of the base xpath manually.
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None
244
245 # On python2.6 the xml.etree.ElementTree.Element methods don't support
246 # the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps in *path* into '{uri}tag' form using *ns_map*.

    Needed because on Python 2.6 the xml.etree.ElementTree.Element methods
    do not accept a namespaces parameter.
    """
    expanded = []
    for component in path.split('/'):
        parts = component.split(':')
        if len(parts) == 1:
            # No namespace prefix on this step.
            expanded.append(parts[0])
        else:
            prefix, tag = parts
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
257
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 is the entity body without the leading
    '&' and trailing ';'; unknown entities are returned literally.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#65;) or hexadecimal (&#x41;).
    # The old pattern '#(x?\d+)' used \d for the hex branch, so any hex
    # reference containing a-f (e.g. '&#x2F;') was silently left unescaped.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr  # 'x2F' -> '0x2F', accepted by int(_, 16)
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
282
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that keeps the full document text in self.html so
    subclasses (AttrParser, MetaParser) can slice it by position."""

    def __init__(self):
        # BUG FIX: this method was named `__init` (missing trailing
        # underscores), so it never ran and self.html was only ever set
        # inside loads().
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Parse *html*, retaining the raw text for later slicing."""
        self.html = html
        self.feed(html)
        self.close()
293
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # Target (attribute, value) pair, e.g. ('id', 'player').
        self.attribute = attribute
        self.value = value
        self.result = None           # becomes [tag, startpos, endpos] when found
        self.started = False         # True while inside the matching element
        self.depth = {}              # per-tag open-element counters
        self.watch_startpos = False  # next parser event records content start
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Malformed HTML recovery: drop one line of input and resume, but
        # give up after 10 errors or once the target tag has been entered
        # (positions would no longer line up with self.html).
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # Any tag after the opening tag fixes the content start position.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # When the outermost matching tag closes, record the end position.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Whatever event follows the opening tag marks where its content begins.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        # Returns the (stripped) text between start and end positions,
        # or None if the tag was never found or never closed.
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: the end column must be shifted
            # by the amount just trimmed from the front.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
359
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper; see get_element_by_attribute for semantics.
    return get_element_by_attribute("id", id, html)
363
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag carrying attribute=value in *html*,
    or None if no such tag is found."""
    tag_parser = AttrParser(attribute, value)
    try:
        tag_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort on malformed HTML: the parser may still hold a result.
        pass
    return tag_parser.get_result()
372
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        # Only <meta> tags are of interest.
        if tag != 'meta':
            return
        attr_map = dict(attrs)
        if attr_map.get('name') == self.name:
            self.result = attr_map.get('content')

    def get_result(self):
        """Return the content attribute of the matched meta tag (or None)."""
        return self.result
393
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Tolerate malformed HTML; a partial parse may still have the answer.
        pass
    return meta_parser.get_result()
404
405
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # <br> and </p><p> are the real line breaks; literal newlines are noise.
    html = html.replace('\n', ' ')
    for pattern, replacement in (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip all remaining tags.
        ('<.*?>', ''),
    ):
        html = re.sub(pattern, replacement, html)
    # Decode HTML entities last, once no markup remains.
    html = unescapeHTML(html)
    return html.strip()
417
418
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission problems will not be solved by renaming; re-raise.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # BUG FIX: os.path.join takes the path components as *separate
        # arguments; the old code passed it a single generator object.
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # BUG FIX: open the sanitized alt_filename, not the original
            # filename that just failed.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
452
453
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
461
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Control characters, DEL, and '?' are always dropped.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    result = u''.join(replace_insane(c) for c in s)
    if not is_id:
        # Collapse runs of underscores and trim them from the ends.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
493
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Membership is tested against the result list itself so that
    # unhashable elements keep working (no set is used).
    result = []
    for element in iterable:
        if element in result:
            continue
        result.append(element)
    return result
501
def unescapeHTML(s):
    """Replace every '&entity;' occurrence in *s* (a unicode string) with
    the character it denotes; unknown entities are left as-is."""
    assert type(s) == type(u'')
    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
510
511
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is passed to a subprocess call
                          (on Windows this forces locale encoding).
    Returns the filename encoded for the current platform: unchanged unicode
    on Python 3 and on NT-family Windows (except for subprocess use),
    otherwise bytes in the filesystem/locale encoding.
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
            return s
        else:
            # For subprocess calls, encode with locale encoding
            # Refer to http://stackoverflow.com/a/9951851/35070
            encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
538
539
def decodeOption(optval):
    """Decode a command-line option value to unicode; None passes through."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())
    assert isinstance(optval, compat_str)
    return optval
548
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS, or plain seconds.

    Uses >= at the unit boundaries: the old strict '>' tests rendered
    exactly one hour as '60:00' and exactly one minute as '60'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
556
557
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build an HTTPSHandler honouring --no-check-certificate.

    On Python < 3.2 urllib has no SSLContext support, so a custom
    HTTPSConnection subclass pins the SSL protocol version instead.
    NOTE(review): PROTOCOL_SSLv3 is insecure and has been removed from
    modern ssl modules -- this code predates the POODLE advisory.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Prefer SSLv3, falling back to protocol auto-negotiation.
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
591
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-level failures are always "expected": they are environment
        # problems, not bugs, so don't ask the user to file a report.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        # Must be captured here, while the original exception is still the
        # "current" one on the interpreter's exception stack.
        self.exc_info = sys.exc_info() # preserve original exception
        self.cause = cause

    def format_traceback(self):
        # Returns the formatted original traceback, or None if none was given.
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
613
614
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
618
619
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
631
632
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
640
641
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Also initialise the Exception base class so that str(exc) and
        # exc.args carry the message (the old version left both empty).
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
650
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
654
655
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
663
664
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
679
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Try a raw deflate stream first, then zlib-wrapped data.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # addinfourl only accepts a 'code' constructor argument on newer
        # Pythons; emulate it on older ones by assigning afterwards.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Re-apply std_headers so they override urllib's own defaults.
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Internal pseudo-headers: honour them, then strip them so they
        # never reach the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response
760
761
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""
    upload_date = None
    # Commas get in the way of every format below.
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2, so strip any offset.
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = (
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    )
    # Try every known format; as before, when several match, the *last*
    # matching expression wins.
    for fmt in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style parsing.
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
797
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension of *url* (text after the final '.' of the
    path part); return *default_ext* when it doesn't look like one."""
    guess = url.partition(u'?')[0].rpartition(u'.')[2]
    # Only plain alphanumeric suffixes qualify as extensions.
    return guess if re.match(r'^[A-Za-z0-9]+$', guess) else default_ext
804
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<language>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
807
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Plain absolute date.
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # Rough approximations: a month is 30 days, a year 365 days.
    if unit == 'month':
        unit = 'day'
        amount *= 30
    elif unit == 'year':
        unit = 'day'
        amount *= 365
    # timedelta takes the plural keyword ('days', 'weeks', ...).
    return today + datetime.timedelta(**{unit + 's': amount})
833
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    mobj = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    # Anything that isn't exactly eight digits is passed through untouched.
    return '-'.join(mobj.groups()) if mobj else date_str
842
class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the widest representable dates.
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day,day)
    def __contains__(self, date):
        """Check if the date is in the range"""
        # Accepts either a datetime.date or any string date_from_str parses.
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end
    def __str__(self):
        return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
868
869
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
878
879
def write_string(s, out=None):
    """Write the unicode string *s* to *out* (default: sys.stderr),
    encoding it as needed for byte streams and broken Windows consoles."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        s = s.encode(preferredencoding(), 'ignore')
    try:
        out.write(s)
    except UnicodeEncodeError:
        # In Windows shells, this can fail even when the codec is just charmap!?
        # See https://wiki.python.org/moin/PrintFails#Issue
        if sys.platform == 'win32' and hasattr(out, 'encoding'):
            # Round-trip through the stream's own encoding, dropping
            # whatever the console cannot represent.
            s = s.encode(out.encoding, 'ignore').decode(out.encoding)
            out.write(s)
        else:
            raise

    out.flush()
900
901
def bytes_to_intlist(bs):
    """Convert a bytes/str byte sequence into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):
        # Python 3: indexing bytes already yields ints.
        return list(bs)
    # Python 2: indexing yields 1-character strings.
    return [ord(c) for c in bs]
909
910
def intlist_to_bytes(xs):
    """Convert a list of integer byte values back into a bytes object."""
    if not xs:
        return b''
    if isinstance(chr(0), bytes):
        # Python 2: bytes is str, so assemble it character by character.
        return ''.join([chr(x) for x in xs])
    return bytes(xs)
918
919
def get_cachedir(params=None):
    """Return the cache directory.

    Precedence: the 'cachedir' key of *params*, then $XDG_CACHE_HOME (with
    'youtube-dl' appended), then ~/.cache/youtube-dl.

    The old signature used a mutable default argument (params={});
    params=None is the safe, backward-compatible equivalent.
    """
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
924
925
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure required by LockFileEx.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range covering the whole file for the lock calls below.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED alive on the file object: unlock needs it.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    # POSIX: advisory locks via lockf (shared or exclusive).
    def _lock_file(f, exclusive):
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.lockf(f, fcntl.LOCK_UN)
989
990
class locked_file(object):
    """Context manager wrapping io.open that holds an advisory whole-file
    lock for the duration: shared for 'r', exclusive for 'a'/'w'."""
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            # Don't leak the file handle if locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Always close, even if unlocking raises.
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1020
1021
def shell_quote(args):
    """Return *args* joined into a single, safely shell-quoted string.

    Bytes entries (e.g. file names from encodeFilename) are decoded with
    the filesystem encoding first.
    """
    # The `pipes` module is deprecated and removed in Python 3.13;
    # prefer shlex.quote (3.3+) and keep pipes.quote for Python 2.
    try:
        from shlex import quote as _quote
    except ImportError:  # Python 2
        from pipes import quote as _quote
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(_quote(a))
    return u' '.join(quoted_args)
1033
1034
def takewhile_inclusive(pred, seq):
    """Yield elements of *seq* while pred holds, including the first
    element for which pred is false (unlike itertools.takewhile)."""
    for elem in seq:
        yield elem
        if not pred(elem):
            break
1042
1043
def smuggle_url(url, data):
    """Append JSON-encoded *data* to *url* as a URL fragment for internal
    use; unsmuggle_url() is the inverse operation."""
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return u'%s#%s' % (url, fragment)
1050
1051
def unsmuggle_url(smug_url, default=None):
    """Extract data embedded by smuggle_url(); return (url, data).

    When *smug_url* carries no smuggled fragment, return
    (smug_url, default) unchanged.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
1059
1060
def format_bytes(bytes):
    """Return a human-readable size string such as u'1.00MiB'.

    *bytes* may be None (yields u'N/A'), a number, or a numeric string.
    """
    if bytes is None:
        return u'N/A'
    # isinstance is more robust than the original `type(bytes) is str`
    # (covers str subclasses as well).
    if isinstance(bytes, str):
        bytes = float(bytes)
    SUFFIXES = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp the exponent: fractional byte counts would otherwise
        # produce a negative index (wrapping to u'YiB'), and values of
        # 1024**9 or more would index past the end of the list.
        exponent = min(max(exponent, 0), len(SUFFIXES) - 1)
    suffix = SUFFIXES[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
1073
1074
def str_to_int(int_str):
    """Parse an int from a string, ignoring ',' and '.' group separators
    (e.g. '1,000,000' or '1.000.000')."""
    return int(re.sub(r'[,\.]', u'', int_str))
1078
1079
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be
    determined.

    The COLUMNS environment variable takes precedence; otherwise the
    width is queried with `stty size`.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            # Malformed COLUMNS value - fall through to stty.
            pass

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    # A bare `except:` would also swallow KeyboardInterrupt/SystemExit;
    # only real errors (stty missing, no tty, unparsable output) are
    # expected here.
    except Exception:
        pass
    return None
1094
1095
def month_by_name(name):
    """Map a (locale-independent) English month name to its number 1-12;
    return None for unrecognised names."""
    english_names = (
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December')
    for number, month in enumerate(english_names, start=1):
        if month == name:
            return number
    return None
1106
1107
def fix_xml_ampersands(xml_str):
    """Escape bare '&' characters in *xml_str* as '&amp;', leaving
    existing entities and character references untouched."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
1114
1115
def setproctitle(title):
    """Set the process name shown by ps/top (Linux only, via prctl).

    Best-effort: silently does nothing when libc or prctl is unavailable.
    """
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Size the buffer for the *encoded* byte length: a UTF-8 encoding may
    # need more bytes than len(title) characters, and the original
    # len(title) + 1 sizing made any non-ASCII title raise ValueError
    # when assigned to the buffer.  (+1 for the terminating NUL.)
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        # 15 is PR_SET_NAME.
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1129
1130
def remove_start(s, start):
    """Return *s* with the prefix *start* stripped, if present."""
    return s[len(start):] if s.startswith(start) else s
1135
1136
def url_basename(url):
    """Return the last path component of *url* (u'' when the URL has no
    path)."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip(u'/').split(u'/')
    return components[-1]
1140
1141
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that issues an HTTP HEAD instead of GET/POST."""
    def get_method(self):
        # urllib consults get_method() to decide which HTTP verb to send.
        return "HEAD"
1145
1146
def int_or_none(v, scale=1):
    """Convert *v* to an int floor-divided by *scale*; pass None through."""
    if v is None:
        return None
    return int(v) // scale
1149
1150
def parse_duration(s):
    """Parse durations like '45', '1:30', '01:02:03' or '1h2m3s' into
    seconds; return None for None input or unrecognised strings."""
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
    if m is None:
        return None
    hours, mins, secs = m.group('hours'), m.group('mins'), m.group('secs')
    duration = int(secs)
    duration += int(mins) * 60 if mins else 0
    duration += int(hours) * 3600 if hours else 0
    return duration
1165
1166
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension: 'foo.mp4' -> 'foo.<ext>.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (base, ext, real_ext)
1170
1171
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # None sentinel instead of a mutable [] default (shared-state pitfall).
    if args is None:
        args = []
    try:
        subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found (or not executable) anywhere in PATH.
        return False
    return exe
1180
1181
class PagedList(object):
    """Lazy list over a paginated data source.

    pagefunc(pagenum) must return an iterable with the results of the
    0-based page *pagenum*; every page except possibly the last holds
    exactly *pagesize* items.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        # Return the items with indices start <= i < end as a plain list,
        # fetching only the pages that overlap that range.
        res = []
        # Begin at the page containing index *start*.
        for pagenum in itertools.count(start // self._pagesize):
            # Absolute indices of this page's first item and of the first
            # item of the next page.
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # In-page index where the requested range begins (0 unless
            # this is the page that contains *start*).
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # In-page index just past the requested range (None = take the
            # rest of the page).
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1227
1228
def uppercase_escape(s):
    """Decode uppercase \\UXXXXXXXX escape sequences in *s* into the
    characters they denote."""
    def _expand(m):
        return compat_chr(int(m.group(1), base=16))
    return re.sub(r'\\U([0-9a-fA-F]{8})', _expand, s)
1233
# Probe whether this interpreter's struct module accepts unicode format
# strings; define struct_pack/struct_unpack accordingly.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        # Encode a unicode format spec to bytes before delegating.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        # Same workaround for struct.unpack.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Unicode specs work natively - use the struct functions directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1250
1251
def read_batch_urls(batch_fd):
    """Read URLs from a batch-file object, one per line.

    Strips a leading BOM, skips blank lines and comment lines starting
    with '#', ';' or ']'.  Closes *batch_fd* when done.
    """
    def fixup(url):
        if isinstance(url, bytes):  # Python 2 file objects yield bytes
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM decodes to u'\ufeff'; the original code only checked
        # for u'\xef\xbb\xbf' (the raw BOM bytes as latin-1 characters,
        # seen when the file was mis-decoded), so a real BOM was never
        # stripped.  Handle both forms.
        for bom in (u'\ufeff', u'\xef\xbb\xbf'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]